llvm-project/llvm/test/CodeGen/X86/mmx-coalescing.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s

; %SA is a packed struct (<{ ... }>) whose field 0 is %union.anon, a wrapper
; around a single <1 x i64>. @test reads that vector through field 0 and also
; loads fields 1, 3 and 4.
%SA = type <{ %union.anon, i32, [4 x i8], i8*, i8*, i8*, i32, [4 x i8] }>
%union.anon = type { <1 x i64> }

; Check that extra movd (copy) instructions aren't generated.
;
; A <1 x i64> value is shuffled and shifted with MMX intrinsics and carried
; between basic blocks through phi nodes. In the expected assembly below the
; value stays in %mm0 throughout: the only MMX -> GPR moves are the ones that
; feed the zero tests and the returned i32, and the only GPR -> MMX move is the
; shift count (%edx -> %mm1).

define i32 @test(%SA* %pSA, i16* %A, i32 %B, i32 %C, i32 %D, i8* %E) {
; CHECK-LABEL: test:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    pshufw $238, (%rdi), %mm0 # mm0 = mem[2,3,2,3]
; CHECK-NEXT:    movd %mm0, %eax
; CHECK-NEXT:    testl %eax, %eax
; CHECK-NEXT:    je .LBB0_1
; CHECK-NEXT:  # %bb.2: # %if.B
; CHECK-NEXT:    pshufw $238, %mm0, %mm0 # mm0 = mm0[2,3,2,3]
; CHECK-NEXT:    movq %mm0, %rax
; CHECK-NEXT:    testl %eax, %eax
; CHECK-NEXT:    jne .LBB0_4
; CHECK-NEXT:  .LBB0_1: # %if.A
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    movd %edx, %mm1
; CHECK-NEXT:    psllq %mm1, %mm0
; CHECK-NEXT:    movq %mm0, %rax
; CHECK-NEXT:    testq %rax, %rax
; CHECK-NEXT:    jne .LBB0_4
; CHECK-NEXT:  # %bb.3: # %if.C
; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
; CHECK-NEXT:    testl %eax, %eax
; CHECK-NEXT:    je .LBB0_1
; CHECK-NEXT:  .LBB0_4: # %merge
; CHECK-NEXT:    pshufw $238, %mm0, %mm0 # mm0 = mm0[2,3,2,3]
; CHECK-NEXT:    movd %mm0, %eax
; CHECK-NEXT:    retq
entry:
  ; %shl, %shl1 and %shl2 are not referenced again in this function.
  %shl = shl i32 1, %B
  %shl1 = shl i32 %C, %B
  %shl2 = shl i32 1, %D
  ; Load the <1 x i64> held in field 0 of %SA (through the %union.anon wrapper).
  %v = getelementptr inbounds %SA, %SA* %pSA, i64 0, i32 0, i32 0
  %v0 = load <1 x i64>, <1 x i64>* %v, align 8
  ; Loads of fields 1, 3 and 4. %v1, %v2 and %v3 are not referenced again in
  ; this function.
  %SA0 = getelementptr inbounds %SA, %SA* %pSA, i64 0, i32 1
  %v1 = load i32, i32* %SA0, align 4
  %SA1 = getelementptr inbounds %SA, %SA* %pSA, i64 0, i32 3
  %v2 = load i8*, i8** %SA1, align 8
  %SA2 = getelementptr inbounds %SA, %SA* %pSA, i64 0, i32 4
  %v3 = load i8*, i8** %SA2, align 8
  ; First pshufw. The immediate i8 -18 is 238 as an unsigned byte, which is the
  ; "$238" (element order [2,3,2,3]) seen in the expected assembly.
  %v4 = bitcast <1 x i64> %v0 to <4 x i16>
  %v5 = bitcast <4 x i16> %v4 to x86_mmx
  %v6 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v5, i8 -18)
  %v7 = bitcast x86_mmx %v6 to <4 x i16>
  %v8 = bitcast <4 x i16> %v7 to <1 x i64>
  ; Branch on element 0 of the <2 x i32> view of the shuffled value:
  ; zero -> %if.A, non-zero -> %if.B.
  %v9 = extractelement <1 x i64> %v8, i32 0
  %v10 = bitcast i64 %v9 to <2 x i32>
  %v11 = extractelement <2 x i32> %v10, i32 0
  %cmp = icmp eq i32 %v11, 0
  br i1 %cmp, label %if.A, label %if.B

if.A:
  ; Loop header: %pa is %v8 when entered from %entry and %vx when re-entered
  ; from %if.C.
  %pa = phi <1 x i64> [ %v8, %entry ], [ %vx, %if.C ]
  %v17 = extractelement <1 x i64> %pa, i32 0
  %v18 = bitcast i64 %v17 to x86_mmx
  ; pslli.q with the variable count %B; the expected assembly moves the count
  ; into %mm1 and emits psllq.
  ; NOTE(review): attribute group #2 is not defined in the visible part of this
  ; file - confirm that is intentional.
  %v19 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %v18, i32 %B) #2
  %v20 = bitcast x86_mmx %v19 to i64
  %v21 = insertelement <1 x i64> undef, i64 %v20, i32 0
  ; Full 64-bit test: a non-zero result exits to %merge, zero continues to %if.C.
  %cmp3 = icmp eq i64 %v20, 0
  br i1 %cmp3, label %if.C, label %merge

if.B:
  ; Second pshufw, applied to %v8 with the same immediate, then jump to %if.C.
  %v34 = bitcast <1 x i64> %v8 to <4 x i16>
  %v35 = bitcast <4 x i16> %v34 to x86_mmx
  %v36 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v35, i8 -18)
  %v37 = bitcast x86_mmx %v36 to <4 x i16>
  %v38 = bitcast <4 x i16> %v37 to <1 x i64>
  br label %if.C

if.C:
  ; %vx is the shifted value (%v21) when coming from %if.A, or the re-shuffled
  ; value (%v38) when coming from %if.B.
  %vx = phi <1 x i64> [ %v21, %if.A ], [ %v38, %if.B ]
  ; Go back to %if.A while element 0 of the <2 x i32> view is zero; otherwise
  ; exit to %merge.
  %cvt = bitcast <1 x i64> %vx to <2 x i32>
  %ex = extractelement <2 x i32> %cvt, i32 0
  %cmp2 = icmp eq i32 %ex, 0
  br i1 %cmp2, label %if.A, label %merge

merge:
  ; %vy is whichever value left the loop: %v21 from %if.A or %vx from %if.C.
  %vy = phi <1 x i64> [ %v21, %if.A ], [ %vx, %if.C ]
  ; Final pshufw, then return element 0 of the <2 x i32> view of the result.
  %v130 = bitcast <1 x i64> %vy to <4 x i16>
  %v131 = bitcast <4 x i16> %v130 to x86_mmx
  %v132 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v131, i8 -18)
  %v133 = bitcast x86_mmx %v132 to <4 x i16>
  %v134 = bitcast <4 x i16> %v133 to <1 x i64>
  %v135 = extractelement <1 x i64> %v134, i32 0
  %v136 = bitcast i64 %v135 to <2 x i32>
  %v137 = extractelement <2 x i32> %v136, i32 0
  ret i32 %v137
}


; MMX intrinsics called by @test.
; pshuf.w: takes an MMX value and an i8 shuffle immediate.
declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8)
; pslli.q: takes an MMX value and an i32 shift count.
declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32)
[X86] Regenerate MMX coalescing test Exposes another extractelement(bitcast(scalartovector())) pattern llvm-svn: 343403 2018-09-30 17:42:04 +08:00			`; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py`
[PeepholeOptimizer] Look through PHIs to find additional register sources Reintroduce r245442. Remove an overly conservative assertion introduced in r245442. We could replace the assertion to use `shareSameRegisterFile` instead, but at that point in `insertPHI` we already lost the original Def subreg to check against. So drop the assertion completely. Original commit message: - Teaches the ValueTracker in the PeepholeOptimizer to look through PHI instructions. - Add findNextSourceAndRewritePHI method to lookup into multiple sources returned by the ValueTracker and rewrite PHIs with new sources. With these changes we can find more register sources and rewrite more copies to allow coalescing of bitcast instructions. Hence, we eliminate unnecessary VR64 <-> GR64 copies in x86, but it could be extended to other archs by marking "isBitcast" on target specific instructions. The x86 example follows: A: psllq %mm1, %mm0 movd %mm0, %r9 jmp C B: por %mm1, %mm0 movd %mm0, %r9 jmp C C: movd %r9, %mm0 pshufw $238, %mm0, %mm0 Becomes: A: psllq %mm1, %mm0 jmp C B: por %mm1, %mm0 jmp C C: pshufw $238, %mm0, %mm0 Differential Revision: http://reviews.llvm.org/D11197 rdar://problem/20404526 llvm-svn: 245479 2015-08-20 02:53:36 +08:00			`; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+sse2 \| FileCheck %s`

			`%SA = type <{ %union.anon, i32, [4 x i8], i8*, i8*, i8*, i32, [4 x i8] }>`
			`%union.anon = type { <1 x i64> }`

			`; Check that extra movd (copy) instructions aren't generated.`

			`define i32 @test(%SA* %pSA, i16* %A, i32 %B, i32 %C, i32 %D, i8* %E) {`
[X86] Regenerate MMX coalescing test Exposes another extractelement(bitcast(scalartovector())) pattern llvm-svn: 343403 2018-09-30 17:42:04 +08:00			`; CHECK-LABEL: test:`
			`; CHECK: # %bb.0: # %entry`
			`; CHECK-NEXT: pshufw $238, (%rdi), %mm0 # mm0 = mem[2,3,2,3]`
			`; CHECK-NEXT: movd %mm0, %eax`
			`; CHECK-NEXT: testl %eax, %eax`
			`; CHECK-NEXT: je .LBB0_1`
			`; CHECK-NEXT: # %bb.2: # %if.B`
			`; CHECK-NEXT: pshufw $238, %mm0, %mm0 # mm0 = mm0[2,3,2,3]`
			`; CHECK-NEXT: movq %mm0, %rax`
[DAGCombiner] reduce insert+bitcast+extract vector ops to truncate (PR39016) This is a late backend subset of the IR transform added with: D52439 We can confirm that the conversion to a 'trunc' is correct by running: $ opt -instcombine -data-layout="e" (assuming the IR transforms are correct; change "e" to "E" for big-endian) As discussed in PR39016: https://bugs.llvm.org/show_bug.cgi?id=39016 ...the pattern may emerge during legalization, so that's why we are waiting for an insertelement to become a scalar_to_vector in the pattern matching here. The DAG allows for fun variations that are not possible in IR. Result types for extracts and scalar_to_vector don't necessarily match input types, so that means we have to be a bit more careful in the transform (see code comments). The tests show that we don't handle cases that require a shift (as we did in the IR version). I've left that as a potential follow-up because I'm not sure if that's a real concern at this late stage. Differential Revision: https://reviews.llvm.org/D53201 llvm-svn: 344872 2018-10-22 04:13:29 +08:00			`; CHECK-NEXT: testl %eax, %eax`
			`; CHECK-NEXT: jne .LBB0_4`
[X86] Regenerate MMX coalescing test Exposes another extractelement(bitcast(scalartovector())) pattern llvm-svn: 343403 2018-09-30 17:42:04 +08:00			`; CHECK-NEXT: .LBB0_1: # %if.A`
[DAGCombiner] reduce insert+bitcast+extract vector ops to truncate (PR39016) This is a late backend subset of the IR transform added with: D52439 We can confirm that the conversion to a 'trunc' is correct by running: $ opt -instcombine -data-layout="e" (assuming the IR transforms are correct; change "e" to "E" for big-endian) As discussed in PR39016: https://bugs.llvm.org/show_bug.cgi?id=39016 ...the pattern may emerge during legalization, so that's why we are waiting for an insertelement to become a scalar_to_vector in the pattern matching here. The DAG allows for fun variations that are not possible in IR. Result types for extracts and scalar_to_vector don't necessarily match input types, so that means we have to be a bit more careful in the transform (see code comments). The tests show that we don't handle cases that require a shift (as we did in the IR version). I've left that as a potential follow-up because I'm not sure if that's a real concern at this late stage. Differential Revision: https://reviews.llvm.org/D53201 llvm-svn: 344872 2018-10-22 04:13:29 +08:00			`; CHECK-NEXT: # =>This Inner Loop Header: Depth=1`
[X86] Regenerate MMX coalescing test Exposes another extractelement(bitcast(scalartovector())) pattern llvm-svn: 343403 2018-09-30 17:42:04 +08:00			`; CHECK-NEXT: movd %edx, %mm1`
			`; CHECK-NEXT: psllq %mm1, %mm0`
			`; CHECK-NEXT: movq %mm0, %rax`
			`; CHECK-NEXT: testq %rax, %rax`
			`; CHECK-NEXT: jne .LBB0_4`
[DAGCombiner] reduce insert+bitcast+extract vector ops to truncate (PR39016) This is a late backend subset of the IR transform added with: D52439 We can confirm that the conversion to a 'trunc' is correct by running: $ opt -instcombine -data-layout="e" (assuming the IR transforms are correct; change "e" to "E" for big-endian) As discussed in PR39016: https://bugs.llvm.org/show_bug.cgi?id=39016 ...the pattern may emerge during legalization, so that's why we are waiting for an insertelement to become a scalar_to_vector in the pattern matching here. The DAG allows for fun variations that are not possible in IR. Result types for extracts and scalar_to_vector don't necessarily match input types, so that means we have to be a bit more careful in the transform (see code comments). The tests show that we don't handle cases that require a shift (as we did in the IR version). I've left that as a potential follow-up because I'm not sure if that's a real concern at this late stage. Differential Revision: https://reviews.llvm.org/D53201 llvm-svn: 344872 2018-10-22 04:13:29 +08:00			`; CHECK-NEXT: # %bb.3: # %if.C`
			`; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1`
[X86] Regenerate MMX coalescing test Exposes another extractelement(bitcast(scalartovector())) pattern llvm-svn: 343403 2018-09-30 17:42:04 +08:00			`; CHECK-NEXT: testl %eax, %eax`
			`; CHECK-NEXT: je .LBB0_1`
			`; CHECK-NEXT: .LBB0_4: # %merge`
			`; CHECK-NEXT: pshufw $238, %mm0, %mm0 # mm0 = mm0[2,3,2,3]`
			`; CHECK-NEXT: movd %mm0, %eax`
			`; CHECK-NEXT: retq`
[PeepholeOptimizer] Look through PHIs to find additional register sources Reintroduce r245442. Remove an overly conservative assertion introduced in r245442. We could replace the assertion to use `shareSameRegisterFile` instead, but at that point in `insertPHI` we already lost the original Def subreg to check against. So drop the assertion completely. Original commit message: - Teaches the ValueTracker in the PeepholeOptimizer to look through PHI instructions. - Add findNextSourceAndRewritePHI method to lookup into multiple sources returned by the ValueTracker and rewrite PHIs with new sources. With these changes we can find more register sources and rewrite more copies to allow coalescing of bitcast instructions. Hence, we eliminate unnecessary VR64 <-> GR64 copies in x86, but it could be extended to other archs by marking "isBitcast" on target specific instructions. The x86 example follows: A: psllq %mm1, %mm0 movd %mm0, %r9 jmp C B: por %mm1, %mm0 movd %mm0, %r9 jmp C C: movd %r9, %mm0 pshufw $238, %mm0, %mm0 Becomes: A: psllq %mm1, %mm0 jmp C B: por %mm1, %mm0 jmp C C: pshufw $238, %mm0, %mm0 Differential Revision: http://reviews.llvm.org/D11197 rdar://problem/20404526 llvm-svn: 245479 2015-08-20 02:53:36 +08:00			`entry:`
			`%shl = shl i32 1, %B`
			`%shl1 = shl i32 %C, %B`
			`%shl2 = shl i32 1, %D`
			`%v = getelementptr inbounds %SA, %SA* %pSA, i64 0, i32 0, i32 0`
			`%v0 = load <1 x i64>, <1 x i64>* %v, align 8`
			`%SA0 = getelementptr inbounds %SA, %SA* %pSA, i64 0, i32 1`
			`%v1 = load i32, i32* %SA0, align 4`
			`%SA1 = getelementptr inbounds %SA, %SA* %pSA, i64 0, i32 3`
			`%v2 = load i8*, i8** %SA1, align 8`
			`%SA2 = getelementptr inbounds %SA, %SA* %pSA, i64 0, i32 4`
			`%v3 = load i8*, i8** %SA2, align 8`
			`%v4 = bitcast <1 x i64> %v0 to <4 x i16>`
			`%v5 = bitcast <4 x i16> %v4 to x86_mmx`
			`%v6 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v5, i8 -18)`
			`%v7 = bitcast x86_mmx %v6 to <4 x i16>`
			`%v8 = bitcast <4 x i16> %v7 to <1 x i64>`
			`%v9 = extractelement <1 x i64> %v8, i32 0`
			`%v10 = bitcast i64 %v9 to <2 x i32>`
			`%v11 = extractelement <2 x i32> %v10, i32 0`
			`%cmp = icmp eq i32 %v11, 0`
			`br i1 %cmp, label %if.A, label %if.B`

			`if.A:`
			`%pa = phi <1 x i64> [ %v8, %entry ], [ %vx, %if.C ]`
			`%v17 = extractelement <1 x i64> %pa, i32 0`
			`%v18 = bitcast i64 %v17 to x86_mmx`
			`%v19 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %v18, i32 %B) #2`
			`%v20 = bitcast x86_mmx %v19 to i64`
			`%v21 = insertelement <1 x i64> undef, i64 %v20, i32 0`
			`%cmp3 = icmp eq i64 %v20, 0`
			`br i1 %cmp3, label %if.C, label %merge`

			`if.B:`
			`%v34 = bitcast <1 x i64> %v8 to <4 x i16>`
			`%v35 = bitcast <4 x i16> %v34 to x86_mmx`
			`%v36 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v35, i8 -18)`
			`%v37 = bitcast x86_mmx %v36 to <4 x i16>`
			`%v38 = bitcast <4 x i16> %v37 to <1 x i64>`
			`br label %if.C`

			`if.C:`
			`%vx = phi <1 x i64> [ %v21, %if.A ], [ %v38, %if.B ]`
			`%cvt = bitcast <1 x i64> %vx to <2 x i32>`
			`%ex = extractelement <2 x i32> %cvt, i32 0`
			`%cmp2 = icmp eq i32 %ex, 0`
			`br i1 %cmp2, label %if.A, label %merge`

			`merge:`
			`%vy = phi <1 x i64> [ %v21, %if.A ], [ %vx, %if.C ]`
			`%v130 = bitcast <1 x i64> %vy to <4 x i16>`
			`%v131 = bitcast <4 x i16> %v130 to x86_mmx`
			`%v132 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v131, i8 -18)`
			`%v133 = bitcast x86_mmx %v132 to <4 x i16>`
			`%v134 = bitcast <4 x i16> %v133 to <1 x i64>`
			`%v135 = extractelement <1 x i64> %v134, i32 0`
			`%v136 = bitcast i64 %v135 to <2 x i32>`
			`%v137 = extractelement <2 x i32> %v136, i32 0`
			`ret i32 %v137`
			`}`


			`declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8)`
			`declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32)`