llvm-project/llvm/test/CodeGen/X86/extract-insert.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=X64

; Inserts %x into lane 3 of an all-zero <4 x i32>, then extracts one lane using
; an i1 index produced by comparing %y with itself.
; The expected assembly on both targets is a lone return: no instruction
; materializes a result value.
; NOTE(review): the function name and the bare return suggest the extract is
; folded to an undefined value - confirm how the i1 index is interpreted.
define i32 @extractelt_undef_insertelt(i32 %x, i32 %y) {
; CHECK-LABEL: extractelt_undef_insertelt:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %b = insertelement <4 x i32> zeroinitializer, i32 %x, i64 3
  ; 'uge' of a value against itself is always true, so the index is constant.
  %c = icmp uge i32 %y, %y
  %d = extractelement <4 x i32> %b, i1 %c
  ret i32 %d
}

; Bitcasts the scalar %x to <4 x i8> and returns lane 0.
; The bitcast has a single use here, and the expected assembly contains no
; vector instructions: the i686 run reads one byte from the stack argument and
; the x86-64 run copies the argument register and returns its low byte.
define i8 @extractelt_bitcast(i32 %x) nounwind {
; X86-LABEL: extractelt_bitcast:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    retl
;
; X64-LABEL: extractelt_bitcast:
; X64:       # %bb.0:
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    # kill: def $al killed $al killed $eax
; X64-NEXT:    retq
  %bc = bitcast i32 %x to <4 x i8>
  %ext = extractelement <4 x i8> %bc, i32 0
  ret i8 %ext
}

; TODO: This should have folded to avoid vector ops, but the transform
; is guarded by 'hasOneUse'. That limitation apparently makes some AMDGPU
; codegen better.

; Same bitcast + lane-0 extract as @extractelt_bitcast, except that the
; <4 x i8> value is also stored through %p, giving the bitcast a second use.
; The expected assembly currently routes the value through %xmm0
; (movd / punpcklbw / movd) on both targets, while the store itself is done
; with a plain 32-bit scalar move.
define i8 @extractelt_bitcast_extra_use(i32 %x, <4 x i8>* %p) nounwind {
; X86-LABEL: extractelt_bitcast_extra_use:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movd %eax, %xmm0
; X86-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl %eax, (%ecx)
; X86-NEXT:    movd %xmm0, %eax
; X86-NEXT:    # kill: def $al killed $al killed $eax
; X86-NEXT:    popl %ecx
; X86-NEXT:    retl
;
; X64-LABEL: extractelt_bitcast_extra_use:
; X64:       # %bb.0:
; X64-NEXT:    movd %edi, %xmm0
; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-NEXT:    movl %edi, (%rsi)
; X64-NEXT:    movd %xmm0, %eax
; X64-NEXT:    # kill: def $al killed $al killed $eax
; X64-NEXT:    retq
  %bc = bitcast i32 %x to <4 x i8>
  ; Additional use of %bc besides the extract below.
  store <4 x i8> %bc, <4 x i8>* %p
  %ext = extractelement <4 x i8> %bc, i32 0
  ret i8 %ext
}

; Places the i64 %x in lane 0 of an otherwise undef <2 x i64>, reinterprets the
; vector as <4 x i32>, and returns lane 0 of that.
; The expected assembly is purely scalar: the i686 run loads one 32-bit stack
; word, and the x86-64 run copies %rdi to %rax and returns the %eax part.
; NOTE(review): the "_le" suffix presumably refers to little-endian lane
; numbering, where this sequence equals a truncate of %x - confirm.
define i32 @trunc_i64_to_i32_le(i64 %x) {
; X86-LABEL: trunc_i64_to_i32_le:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    retl
;
; X64-LABEL: trunc_i64_to_i32_le:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdi, %rax
; X64-NEXT:    # kill: def $eax killed $eax killed $rax
; X64-NEXT:    retq
  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
  %bc = bitcast <2 x i64> %ins to <4 x i32>
  %ext = extractelement <4 x i32> %bc, i32 0
  ret i32 %ext
}

; Places the i64 %x in lane 0 of an otherwise undef <2 x i64>, reinterprets the
; vector as <8 x i16>, and returns lane 0 of that.
; The expected assembly is purely scalar: the i686 run loads a 32-bit stack
; word and returns its %ax part, and the x86-64 run copies %rdi to %rax and
; returns the %ax part.
; NOTE(review): the "_le" suffix presumably refers to little-endian lane
; numbering, where this sequence equals a truncate of %x - confirm.
define i16 @trunc_i64_to_i16_le(i64 %x) {
; X86-LABEL: trunc_i64_to_i16_le:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    retl
;
; X64-LABEL: trunc_i64_to_i16_le:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdi, %rax
; X64-NEXT:    # kill: def $ax killed $ax killed $rax
; X64-NEXT:    retq
  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
  %bc = bitcast <2 x i64> %ins to <8 x i16>
  %ext = extractelement <8 x i16> %bc, i32 0
  ret i16 %ext
}

; Places the i32 %x in lane 0 of an otherwise undef <4 x i32>, reinterprets the
; vector as <16 x i8>, and returns lane 0 of that.
; The expected assembly is purely scalar: the i686 run loads a single byte from
; the stack argument, and the x86-64 run copies %edi to %eax and returns the
; %al part.
; NOTE(review): the "_le" suffix presumably refers to little-endian lane
; numbering, where this sequence equals a truncate of %x - confirm.
define i8 @trunc_i32_to_i8_le(i32 %x) {
; X86-LABEL: trunc_i32_to_i8_le:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    retl
;
; X64-LABEL: trunc_i32_to_i8_le:
; X64:       # %bb.0:
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    # kill: def $al killed $al killed $eax
; X64-NEXT:    retq
  %ins = insertelement <4 x i32> undef, i32 %x, i32 0
  %bc = bitcast <4 x i32> %ins to <16 x i8>
  %ext = extractelement <16 x i8> %bc, i32 0
  ret i8 %ext
}
[DAGCombine] Handle out of range EXTRACT_VECTOR_ELT indices Handle this in DAGCombiner::visitEXTRACT_VECTOR_ELT the same as we already do in SelectionDAG::getNode and use APInt instead of getZExtValue. This should also fix oss-fuzz #4910 llvm-svn: 321767 2018-01-04 06:42:33 +08:00			`; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py`
			`; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 \| FileCheck %s --check-prefix=CHECK --check-prefix=X86`
			`; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 \| FileCheck %s --check-prefix=CHECK --check-prefix=X64`

			`define i32 @extractelt_undef_insertelt(i32 %x, i32 %y) {`
[x86] regenerate CHECKs; NFC llvm-svn: 344301 2018-10-12 05:44:38 +08:00			`; CHECK-LABEL: extractelt_undef_insertelt:`
			`; CHECK: # %bb.0:`
			`; CHECK-NEXT: ret{{[l\|q]}}`
[DAGCombine] Handle out of range EXTRACT_VECTOR_ELT indices Handle this in DAGCombiner::visitEXTRACT_VECTOR_ELT the same as we already do in SelectionDAG::getNode and use APInt instead of getZExtValue. This should also fix oss-fuzz #4910 llvm-svn: 321767 2018-01-04 06:42:33 +08:00			`%b = insertelement <4 x i32> zeroinitializer, i32 %x, i64 3`
			`%c = icmp uge i32 %y, %y`
			`%d = extractelement <4 x i32> %b, i1 %c`
			`ret i32 %d`
			`}`

[x86] add tests for extract_element; NFC The transform for this pattern has an unnecessary one-use limitation. llvm-svn: 344303 2018-10-12 06:04:36 +08:00			`define i8 @extractelt_bitcast(i32 %x) nounwind {`
			`; X86-LABEL: extractelt_bitcast:`
			`; X86: # %bb.0:`
			`; X86-NEXT: movb {{[0-9]+}}(%esp), %al`
			`; X86-NEXT: retl`
			`;`
			`; X64-LABEL: extractelt_bitcast:`
			`; X64: # %bb.0:`
			`; X64-NEXT: movl %edi, %eax`
			`; X64-NEXT: # kill: def $al killed $al killed $eax`
			`; X64-NEXT: retq`
			`%bc = bitcast i32 %x to <4 x i8>`
			`%ext = extractelement <4 x i8> %bc, i32 0`
			`ret i8 %ext`
			`}`

[DAGCombiner] rearrange extract_element+bitcast fold; NFC I want to add another pattern here that includes scalar_to_vector, so this makes that patch smaller. I was hoping to remove the hasOneUse() check because it shouldn't be necessary for common codegen, but an AMDGPU test has a comment suggesting that the extra check makes things better on one of those targets. llvm-svn: 344320 2018-10-12 07:56:56 +08:00			`; TODO: This should have folded to avoid vector ops, but the transform`
[AArch64][x86] add tests for trunc disguised as vector ops (PR39016); NFC These correspond to the IR transform from: D52439 llvm-svn: 344353 2018-10-12 23:22:14 +08:00			`; is guarded by 'hasOneUse'. That limitation apparently makes some AMDGPU`
[DAGCombiner] rearrange extract_element+bitcast fold; NFC I want to add another pattern here that includes scalar_to_vector, so this makes that patch smaller. I was hoping to remove the hasOneUse() check because it shouldn't be necessary for common codegen, but an AMDGPU test has a comment suggesting that the extra check makes things better on one of those targets. llvm-svn: 344320 2018-10-12 07:56:56 +08:00			`; codegen better.`

[x86] add tests for extract_element; NFC The transform for this pattern has an unnecessary one-use limitation. llvm-svn: 344303 2018-10-12 06:04:36 +08:00			`define i8 @extractelt_bitcast_extra_use(i32 %x, <4 x i8>* %p) nounwind {`
			`; X86-LABEL: extractelt_bitcast_extra_use:`
			`; X86: # %bb.0:`
			`; X86-NEXT: pushl %eax`
			`; X86-NEXT: movl {{[0-9]+}}(%esp), %eax`
			`; X86-NEXT: movd %eax, %xmm0`
			`; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]`
			`; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx`
			`; X86-NEXT: movl %eax, (%ecx)`
			`; X86-NEXT: movd %xmm0, %eax`
			`; X86-NEXT: # kill: def $al killed $al killed $eax`
			`; X86-NEXT: popl %ecx`
			`; X86-NEXT: retl`
			`;`
			`; X64-LABEL: extractelt_bitcast_extra_use:`
			`; X64: # %bb.0:`
			`; X64-NEXT: movd %edi, %xmm0`
			`; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]`
			`; X64-NEXT: movl %edi, (%rsi)`
			`; X64-NEXT: movd %xmm0, %eax`
			`; X64-NEXT: # kill: def $al killed $al killed $eax`
			`; X64-NEXT: retq`
			`%bc = bitcast i32 %x to <4 x i8>`
			`store <4 x i8> %bc, <4 x i8>* %p`
			`%ext = extractelement <4 x i8> %bc, i32 0`
			`ret i8 %ext`
			`}`

[AArch64][x86] add tests for trunc disguised as vector ops (PR39016); NFC These correspond to the IR transform from: D52439 llvm-svn: 344353 2018-10-12 23:22:14 +08:00			`define i32 @trunc_i64_to_i32_le(i64 %x) {`
			`; X86-LABEL: trunc_i64_to_i32_le:`
			`; X86: # %bb.0:`
			`; X86-NEXT: movl {{[0-9]+}}(%esp), %eax`
			`; X86-NEXT: retl`
			`;`
			`; X64-LABEL: trunc_i64_to_i32_le:`
			`; X64: # %bb.0:`
[DAGCombiner] reduce insert+bitcast+extract vector ops to truncate (PR39016) This is a late backend subset of the IR transform added with: D52439 We can confirm that the conversion to a 'trunc' is correct by running: $ opt -instcombine -data-layout="e" (assuming the IR transforms are correct; change "e" to "E" for big-endian) As discussed in PR39016: https://bugs.llvm.org/show_bug.cgi?id=39016 ...the pattern may emerge during legalization, so that's why we are waiting for an insertelement to become a scalar_to_vector in the pattern matching here. The DAG allows for fun variations that are not possible in IR. Result types for extracts and scalar_to_vector don't necessarily match input types, so that means we have to be a bit more careful in the transform (see code comments). The tests show that we don't handle cases that require a shift (as we did in the IR version). I've left that as a potential follow-up because I'm not sure if that's a real concern at this late stage. Differential Revision: https://reviews.llvm.org/D53201 llvm-svn: 344872 2018-10-22 04:13:29 +08:00			`; X64-NEXT: movq %rdi, %rax`
			`; X64-NEXT: # kill: def $eax killed $eax killed $rax`
[AArch64][x86] add tests for trunc disguised as vector ops (PR39016); NFC These correspond to the IR transform from: D52439 llvm-svn: 344353 2018-10-12 23:22:14 +08:00			`; X64-NEXT: retq`
			`%ins = insertelement <2 x i64> undef, i64 %x, i32 0`
			`%bc = bitcast <2 x i64> %ins to <4 x i32>`
			`%ext = extractelement <4 x i32> %bc, i32 0`
			`ret i32 %ext`
			`}`

			`define i16 @trunc_i64_to_i16_le(i64 %x) {`
			`; X86-LABEL: trunc_i64_to_i16_le:`
			`; X86: # %bb.0:`
			`; X86-NEXT: movl {{[0-9]+}}(%esp), %eax`
			`; X86-NEXT: # kill: def $ax killed $ax killed $eax`
			`; X86-NEXT: retl`
			`;`
			`; X64-LABEL: trunc_i64_to_i16_le:`
			`; X64: # %bb.0:`
[DAGCombiner] reduce insert+bitcast+extract vector ops to truncate (PR39016) This is a late backend subset of the IR transform added with: D52439 We can confirm that the conversion to a 'trunc' is correct by running: $ opt -instcombine -data-layout="e" (assuming the IR transforms are correct; change "e" to "E" for big-endian) As discussed in PR39016: https://bugs.llvm.org/show_bug.cgi?id=39016 ...the pattern may emerge during legalization, so that's why we are waiting for an insertelement to become a scalar_to_vector in the pattern matching here. The DAG allows for fun variations that are not possible in IR. Result types for extracts and scalar_to_vector don't necessarily match input types, so that means we have to be a bit more careful in the transform (see code comments). The tests show that we don't handle cases that require a shift (as we did in the IR version). I've left that as a potential follow-up because I'm not sure if that's a real concern at this late stage. Differential Revision: https://reviews.llvm.org/D53201 llvm-svn: 344872 2018-10-22 04:13:29 +08:00			`; X64-NEXT: movq %rdi, %rax`
			`; X64-NEXT: # kill: def $ax killed $ax killed $rax`
[AArch64][x86] add tests for trunc disguised as vector ops (PR39016); NFC These correspond to the IR transform from: D52439 llvm-svn: 344353 2018-10-12 23:22:14 +08:00			`; X64-NEXT: retq`
			`%ins = insertelement <2 x i64> undef, i64 %x, i32 0`
			`%bc = bitcast <2 x i64> %ins to <8 x i16>`
			`%ext = extractelement <8 x i16> %bc, i32 0`
			`ret i16 %ext`
			`}`

			`define i8 @trunc_i32_to_i8_le(i32 %x) {`
			`; X86-LABEL: trunc_i32_to_i8_le:`
			`; X86: # %bb.0:`
			`; X86-NEXT: movb {{[0-9]+}}(%esp), %al`
			`; X86-NEXT: retl`
			`;`
			`; X64-LABEL: trunc_i32_to_i8_le:`
			`; X64: # %bb.0:`
			`; X64-NEXT: movl %edi, %eax`
			`; X64-NEXT: # kill: def $al killed $al killed $eax`
			`; X64-NEXT: retq`
			`%ins = insertelement <4 x i32> undef, i32 %x, i32 0`
			`%bc = bitcast <4 x i32> %ins to <16 x i8>`
			`%ext = extractelement <16 x i8> %bc, i32 0`
			`ret i8 %ext`
			`}`