; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE41

; Widen a v5i16 to v8i16 to do a vector sub and multiply
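;
; A rough C-level sketch of the loop this IR encodes (a hypothetical
; reconstruction for illustration only; the ext_vector_type typedef and the
; names are assumptions, not recovered source):
;
;   typedef short v5i16 __attribute__((ext_vector_type(5)));
;
;   void update(v5i16 *dst, v5i16 *src, int n) {
;     const v5i16 k = {271, 271, 271, 271, 271};
;     const v5i16 m = {2, 4, 2, 2, 2};
;     for (int i = 0; i < n; i++)
;       dst[i] = (src[i] - k) * m;
;   }
;
; Since <5 x i16> is not a legal type, the sub/mul are widened to v8i16 and
; the five lanes are stored back as a 64-bit movq plus a pextrw/movw of the
; fifth element. With SSE4.1 the multiply by <2,4,2,2,2> is lowered as two
; psllw shifts combined with a pblendw instead of a pmullw (rL336642,
; https://reviews.llvm.org/D48936).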
define void @update(<5 x i16>* %dst, <5 x i16>* %src, i32 %n) nounwind {
; SSE2-LABEL: update:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movabsq $4295032833, %rax # imm = 0x100010001
; SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movw $0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movl $0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = <271,271,271,271,271,u,u,u>
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = <2,4,2,2,2,u,u,u>
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB0_1: # %forcond
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: cmpl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: jge .LBB0_3
; SSE2-NEXT: # %bb.2: # %forbody
; SSE2-NEXT: # in Loop: Header=BB0_1 Depth=1
; SSE2-NEXT: movslq -{{[0-9]+}}(%rsp), %rax
; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
; SSE2-NEXT: shlq $4, %rax
; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
; SSE2-NEXT: movdqa (%rdx,%rax), %xmm2
; SSE2-NEXT: psubw %xmm0, %xmm2
; SSE2-NEXT: pmullw %xmm1, %xmm2
; SSE2-NEXT: movq %xmm2, (%rcx,%rax)
; SSE2-NEXT: pextrw $4, %xmm2, %edx
; SSE2-NEXT: movw %dx, 8(%rcx,%rax)
; SSE2-NEXT: incl -{{[0-9]+}}(%rsp)
; SSE2-NEXT: jmp .LBB0_1
; SSE2-NEXT: .LBB0_3: # %afterfor
; SSE2-NEXT: retq
;
; SSE41-LABEL: update:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; SSE41-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
; SSE41-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
; SSE41-NEXT: movabsq $4295032833, %rax # imm = 0x100010001
; SSE41-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; SSE41-NEXT: movw $0, -{{[0-9]+}}(%rsp)
; SSE41-NEXT: movl $0, -{{[0-9]+}}(%rsp)
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = <271,271,271,271,271,u,u,u>
; SSE41-NEXT: .p2align 4, 0x90
; SSE41-NEXT: .LBB0_1: # %forcond
; SSE41-NEXT: # =>This Inner Loop Header: Depth=1
; SSE41-NEXT: movl -{{[0-9]+}}(%rsp), %eax
; SSE41-NEXT: cmpl -{{[0-9]+}}(%rsp), %eax
; SSE41-NEXT: jge .LBB0_3
; SSE41-NEXT: # %bb.2: # %forbody
; SSE41-NEXT: # in Loop: Header=BB0_1 Depth=1
; SSE41-NEXT: movslq -{{[0-9]+}}(%rsp), %rax
; SSE41-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
; SSE41-NEXT: shlq $4, %rax
; SSE41-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
; SSE41-NEXT: movdqa (%rdx,%rax), %xmm1
; SSE41-NEXT: psubw %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psllw $2, %xmm2
; SSE41-NEXT: psllw $1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
; SSE41-NEXT: pextrw $4, %xmm1, 8(%rcx,%rax)
; SSE41-NEXT: movq %xmm2, (%rcx,%rax)
; SSE41-NEXT: incl -{{[0-9]+}}(%rsp)
; SSE41-NEXT: jmp .LBB0_1
; SSE41-NEXT: .LBB0_3: # %afterfor
; SSE41-NEXT: retq
entry:
%dst.addr = alloca <5 x i16>*
%src.addr = alloca <5 x i16>*
%n.addr = alloca i32
%v = alloca <5 x i16>, align 16
%i = alloca i32, align 4
store <5 x i16>* %dst, <5 x i16>** %dst.addr
store <5 x i16>* %src, <5 x i16>** %src.addr
store i32 %n, i32* %n.addr
store <5 x i16> < i16 1, i16 1, i16 1, i16 0, i16 0 >, <5 x i16>* %v
store i32 0, i32* %i
	br label %forcond

forcond:
%tmp = load i32, i32* %i
%tmp1 = load i32, i32* %n.addr
%cmp = icmp slt i32 %tmp, %tmp1
	br i1 %cmp, label %forbody, label %afterfor

forbody:
%tmp2 = load i32, i32* %i
%tmp3 = load <5 x i16>*, <5 x i16>** %dst.addr
%arrayidx = getelementptr <5 x i16>, <5 x i16>* %tmp3, i32 %tmp2
%tmp4 = load i32, i32* %i
%tmp5 = load <5 x i16>*, <5 x i16>** %src.addr
%arrayidx6 = getelementptr <5 x i16>, <5 x i16>* %tmp5, i32 %tmp4
%tmp7 = load <5 x i16>, <5 x i16>* %arrayidx6
%sub = sub <5 x i16> %tmp7, < i16 271, i16 271, i16 271, i16 271, i16 271 >
%mul = mul <5 x i16> %sub, < i16 2, i16 4, i16 2, i16 2, i16 2 >
store <5 x i16> %mul, <5 x i16>* %arrayidx
	br label %forinc

forinc:
%tmp8 = load i32, i32* %i
%inc = add i32 %tmp8, 1
store i32 %inc, i32* %i
	br label %forcond

afterfor:
ret void
}