; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE41

; Widen a v5i16 to v8i16 to do a vector sub and multiply
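;
; A rough C-level sketch of the loop this IR encodes (a hypothetical
; reconstruction for illustration only; the ext_vector_type typedef and the
; names are assumptions, not recovered source):
;
;   typedef short v5i16 __attribute__((ext_vector_type(5)));
;
;   void update(v5i16 *dst, v5i16 *src, int n) {
;     const v5i16 k = {271, 271, 271, 271, 271};
;     const v5i16 m = {2, 4, 2, 2, 2};
;     for (int i = 0; i < n; i++)
;       dst[i] = (src[i] - k) * m;
;   }
;
; Since <5 x i16> is not a legal type, the sub/mul are widened to v8i16 and
; the five lanes are stored back as a 64-bit movq plus a pextrw/movw of the
; fifth element. With SSE4.1 the multiply by <2,4,2,2,2> is lowered as two
; psllw shifts combined with a pblendw instead of a pmullw (rL336642,
; https://reviews.llvm.org/D48936).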
define void @update(<5 x i16>* %dst, <5 x i16>* %src, i32 %n) nounwind {
; SSE2-LABEL: update:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movabsq $4295032833, %rax # imm = 0x100010001
; SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movw $0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movl $0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = <271,271,271,271,271,u,u,u>
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = <2,4,2,2,2,u,u,u>
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB0_1: # %forcond
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: cmpl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: jge .LBB0_3
; SSE2-NEXT: # %bb.2: # %forbody
; SSE2-NEXT: # in Loop: Header=BB0_1 Depth=1
; SSE2-NEXT: movslq -{{[0-9]+}}(%rsp), %rax
; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
; SSE2-NEXT: shlq $4, %rax
; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
; SSE2-NEXT: movdqa (%rdx,%rax), %xmm2
; SSE2-NEXT: psubw %xmm0, %xmm2
; SSE2-NEXT: pmullw %xmm1, %xmm2
; SSE2-NEXT: movq %xmm2, (%rcx,%rax)
; SSE2-NEXT: pextrw $4, %xmm2, %edx
; SSE2-NEXT: movw %dx, 8(%rcx,%rax)
; SSE2-NEXT: incl -{{[0-9]+}}(%rsp)
; SSE2-NEXT: jmp .LBB0_1
; SSE2-NEXT: .LBB0_3: # %afterfor
; SSE2-NEXT: retq
;
; SSE41-LABEL: update:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; SSE41-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
; SSE41-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
; SSE41-NEXT: movabsq $4295032833, %rax # imm = 0x100010001
; SSE41-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; SSE41-NEXT: movw $0, -{{[0-9]+}}(%rsp)
; SSE41-NEXT: movl $0, -{{[0-9]+}}(%rsp)
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = <271,271,271,271,271,u,u,u>
; SSE41-NEXT: .p2align 4, 0x90
; SSE41-NEXT: .LBB0_1: # %forcond
; SSE41-NEXT: # =>This Inner Loop Header: Depth=1
; SSE41-NEXT: movl -{{[0-9]+}}(%rsp), %eax
; SSE41-NEXT: cmpl -{{[0-9]+}}(%rsp), %eax
; SSE41-NEXT: jge .LBB0_3
; SSE41-NEXT: # %bb.2: # %forbody
; SSE41-NEXT: # in Loop: Header=BB0_1 Depth=1
; SSE41-NEXT: movslq -{{[0-9]+}}(%rsp), %rax
; SSE41-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
; SSE41-NEXT: shlq $4, %rax
; SSE41-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
; SSE41-NEXT: movdqa (%rdx,%rax), %xmm1
; SSE41-NEXT: psubw %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psllw $2, %xmm2
; SSE41-NEXT: psllw $1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
; SSE41-NEXT: pextrw $4, %xmm1, 8(%rcx,%rax)
; SSE41-NEXT: movq %xmm2, (%rcx,%rax)
; SSE41-NEXT: incl -{{[0-9]+}}(%rsp)
; SSE41-NEXT: jmp .LBB0_1
; SSE41-NEXT: .LBB0_3: # %afterfor
; SSE41-NEXT: retq
entry:
%dst.addr = alloca <5 x i16>*
%src.addr = alloca <5 x i16>*
%n.addr = alloca i32
%v = alloca <5 x i16>, align 16
%i = alloca i32, align 4
store <5 x i16>* %dst, <5 x i16>** %dst.addr
store <5 x i16>* %src, <5 x i16>** %src.addr
store i32 %n, i32* %n.addr
store <5 x i16> < i16 1, i16 1, i16 1, i16 0, i16 0 >, <5 x i16>* %v
store i32 0, i32* %i
	br label %forcond

forcond:
%tmp = load i32, i32* %i
%tmp1 = load i32, i32* %n.addr
%cmp = icmp slt i32 %tmp, %tmp1
	br i1 %cmp, label %forbody, label %afterfor

forbody:
%tmp2 = load i32, i32* %i
%tmp3 = load <5 x i16>*, <5 x i16>** %dst.addr
%arrayidx = getelementptr <5 x i16>, <5 x i16>* %tmp3, i32 %tmp2
%tmp4 = load i32, i32* %i
%tmp5 = load <5 x i16>*, <5 x i16>** %src.addr
%arrayidx6 = getelementptr <5 x i16>, <5 x i16>* %tmp5, i32 %tmp4
%tmp7 = load <5 x i16>, <5 x i16>* %arrayidx6
%sub = sub <5 x i16> %tmp7, < i16 271, i16 271, i16 271, i16 271, i16 271 >
%mul = mul <5 x i16> %sub, < i16 2, i16 4, i16 2, i16 2, i16 2 >
store <5 x i16> %mul, <5 x i16>* %arrayidx
	br label %forinc

forinc:
%tmp8 = load i32, i32* %i
%inc = add i32 %tmp8, 1
store i32 %inc, i32* %i
	br label %forcond

afterfor:
ret void
}