; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse -O3 | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-sse2,+sse -O3 | FileCheck %s --check-prefixes=CHECK,X64

; Tests for SSE1 and below, without SSE2+.

; PR7993
;define <4 x i32> @test3(<4 x i16> %a) nounwind {
; %c = sext <4 x i16> %a to <4 x i32> ; <<4 x i32>> [#uses=1]
; ret <4 x i32> %c
;}

; This should not emit shuffles to populate the top 2 elements of the 4-element
; vector that this ends up returning.
; rdar://8368414
define <2 x float> @test4(<2 x float> %A, <2 x float> %B) nounwind {
; CHECK-LABEL: test4:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movaps %xmm0, %xmm2
; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[2,3]
; CHECK-NEXT: addss %xmm1, %xmm0
; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
; CHECK-NEXT: subss %xmm1, %xmm2
; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %tmp7 = extractelement <2 x float> %A, i32 0
  %tmp5 = extractelement <2 x float> %A, i32 1
  %tmp3 = extractelement <2 x float> %B, i32 0
  %tmp1 = extractelement <2 x float> %B, i32 1
  %add.r = fadd float %tmp7, %tmp3
  %add.i = fsub float %tmp5, %tmp1
  %tmp11 = insertelement <2 x float> undef, float %add.r, i32 0
  %tmp9 = insertelement <2 x float> %tmp11, float %add.i, i32 1
  ret <2 x float> %tmp9
}

; We used to get stuck in type legalization for this example when lowering the
; vselect. With SSE1 v4f32 is a legal type but v4i1 (or any vector integer type)
; is not. We used to ping pong between splitting the vselect for the v4i1
; condition operand and widening the resulting vselect for the v4f32 result.
; PR18036

define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
; X86-LABEL: vselect:
; X86: # %bb.0: # %entry
; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; X86-NEXT: xorps %xmm0, %xmm0
; X86-NEXT: je .LBB1_1
; X86-NEXT: # %bb.2: # %entry
; X86-NEXT: xorps %xmm1, %xmm1
; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; X86-NEXT: jne .LBB1_5
; X86-NEXT: .LBB1_4:
; X86-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; X86-NEXT: jne .LBB1_8
; X86-NEXT: .LBB1_7:
; X86-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; X86-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-NEXT: je .LBB1_10
; X86-NEXT: jmp .LBB1_11
; X86-NEXT: .LBB1_1:
; X86-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; X86-NEXT: je .LBB1_4
; X86-NEXT: .LBB1_5: # %entry
; X86-NEXT: xorps %xmm2, %xmm2
; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; X86-NEXT: je .LBB1_7
; X86-NEXT: .LBB1_8: # %entry
; X86-NEXT: xorps %xmm3, %xmm3
; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; X86-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-NEXT: jne .LBB1_11
; X86-NEXT: .LBB1_10:
; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: .LBB1_11: # %entry
; X86-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X86-NEXT: retl
;
; X64-LABEL: vselect:
; X64: # %bb.0: # %entry
; X64-NEXT: testl %edx, %edx
; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: je .LBB1_1
; X64-NEXT: # %bb.2: # %entry
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: testl %ecx, %ecx
; X64-NEXT: jne .LBB1_5
; X64-NEXT: .LBB1_4:
; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT: testl %r8d, %r8d
; X64-NEXT: jne .LBB1_8
; X64-NEXT: .LBB1_7:
; X64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; X64-NEXT: testl %esi, %esi
; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-NEXT: je .LBB1_10
; X64-NEXT: jmp .LBB1_11
; X64-NEXT: .LBB1_1:
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: testl %ecx, %ecx
; X64-NEXT: je .LBB1_4
; X64-NEXT: .LBB1_5: # %entry
; X64-NEXT: xorps %xmm2, %xmm2
; X64-NEXT: testl %r8d, %r8d
; X64-NEXT: je .LBB1_7
; X64-NEXT: .LBB1_8: # %entry
; X64-NEXT: xorps %xmm3, %xmm3
; X64-NEXT: testl %esi, %esi
; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-NEXT: jne .LBB1_11
; X64-NEXT: .LBB1_10:
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: .LBB1_11: # %entry
; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X64-NEXT: retq
entry:
  %a1 = icmp eq <4 x i32> %q, zeroinitializer
  %a14 = select <4 x i1> %a1, <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>, <4 x float> zeroinitializer
  ret <4 x float> %a14
}

; v4i32 isn't legal for SSE1, but this should be cmpps.
define <4 x float> @PR28044(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: PR28044:
; CHECK: # %bb.0:
; CHECK-NEXT: cmpeqps %xmm1, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
  %cmp = fcmp oeq <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

; Don't crash trying to do the impossible: an integer vector comparison doesn't exist, so we must scalarize.
; https://llvm.org/bugs/show_bug.cgi?id=30512

define <4 x i32> @PR30512(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-LABEL: PR30512:
; X86: # %bb.0:
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: subl $16, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: xorl %ebx, %ebx
; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi
; X86-NEXT: sete %bl
; X86-NEXT: negl %ebx
; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NEXT: xorl %ebx, %ebx
; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi
; X86-NEXT: sete %bl
; X86-NEXT: negl %ebx
; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NEXT: xorl %ebx, %ebx
; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx
; X86-NEXT: sete %bl
; X86-NEXT: negl %ebx
; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: sete %dl
; X86-NEXT: negl %edx
; X86-NEXT: movl %edx, (%esp)
; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; X86-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; X86-NEXT: andps {{\.LCPI.*}}, %xmm2
; X86-NEXT: movaps %xmm2, (%eax)
; X86-NEXT: addl $16, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X86-NEXT: retl $4
;
; X64-LABEL: PR30512:
; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpl {{[0-9]+}}(%rsp), %r8d
; X64-NEXT: sete %al
; X64-NEXT: negl %eax
; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpl {{[0-9]+}}(%rsp), %ecx
; X64-NEXT: sete %al
; X64-NEXT: negl %eax
; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpl {{[0-9]+}}(%rsp), %edx
; X64-NEXT: sete %al
; X64-NEXT: negl %eax
; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpl %r9d, %esi
; X64-NEXT: sete %al
; X64-NEXT: negl %eax
; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; X64-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; X64-NEXT: andps {{.*}}(%rip), %xmm2
; X64-NEXT: movaps %xmm2, (%rdi)
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: retq
  %cmp = icmp eq <4 x i32> %x, %y
  %zext = zext <4 x i1> %cmp to <4 x i32>
  ret <4 x i32> %zext
}

; Fragile test warning - we need to induce the generation of a vselect
; post-legalization to cause the crash seen in:
; https://llvm.org/bugs/show_bug.cgi?id=31672
; Is there a way to do that without an unsafe/fast sqrt intrinsic call?
;
; We now no longer try to lower sqrt using rsqrt with SSE1 only as the
; v4i32 vselect mentioned above should never have been created. We ended up
; scalarizing it anyway.

define <2 x float> @PR31672() #0 {
; X86-LABEL: PR31672:
; X86: # %bb.0:
; X86-NEXT: sqrtps {{\.LCPI.*}}, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: PR31672:
; X64: # %bb.0:
; X64-NEXT: sqrtps {{.*}}(%rip), %xmm0
; X64-NEXT: retq
  %t0 = call fast <2 x float> @llvm.sqrt.v2f32(<2 x float> <float 42.0, float 3.0>)
  ret <2 x float> %t0
}

declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) #1

attributes #0 = { nounwind "unsafe-fp-math"="true" }
attributes #1 = { nounwind readnone }