2016-02-02 05:06:32 +08:00
|
|
|
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
2014-11-24 20:23:15 +08:00
|
|
|
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
|
|
|
|
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3
|
|
|
|
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
|
2014-10-02 14:52:19 +08:00
|
|
|
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
|
|
|
|
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
|
2011-09-09 05:05:43 +08:00
|
|
|
|
2011-09-10 04:29:17 +08:00
|
|
|
; AVX128 tests:
|
|
|
|
|
2011-09-09 05:05:43 +08:00
|
|
|
define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE2-LABEL: vsel_float:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSE2: # BB#0: # %entry
|
[x86,sdag] Two interrelated changes to the x86 and sdag code.
First, don't combine bit masking into vector shuffles (even ones the
target can handle) once operation legalization has taken place. Custom
legalization of vector shuffles may exist for these patterns (making the
predicate return true) but that custom legalization may in some cases
produce the exact bit math this matches. We only really want to handle
this prior to operation legalization.
However, the x86 backend, in a fit of awesome, relied on this. What it
would do is mark VSELECTs as expand, which would turn them into
arithmetic, which this would then match back into vector shuffles, which
we would then lower properly. Amazing.
Instead, the second change is to teach the x86 backend to directly form
vector shuffles from VSELECT nodes with constant conditions, and to mark
all of the vector types we support lowering blends as shuffles as custom
VSELECT lowering. We still mark the forms which actually support
variable blends as *legal* so that the custom lowering is bypassed, and
the legal lowering can even be used by the vector shuffle legalization
(yes, i know, this is confusing. but that's how the patterns are
written).
This makes the VSELECT lowering much more sensible, and in fact should
fix a bunch of bugs with it. However, as you'll see in the test cases,
right now what it does is point out the *hilarious* deficiency of the
new vector shuffle lowering when it comes to blends. Fortunately, my
very next patch fixes that. I can't submit it yet, because that patch,
somewhat obviously, forms the exact and/or pattern that the DAG combine
is matching here! Without this patch, teaching the vector shuffle
lowering to produce the right code infloops in the DAG combiner. With
this patch alone, we produce terrible code but at least lower through
the right paths. With both patches, all the regressions here should be
fixed, and a bunch of the improvements (like using 2 shufps with no
memory loads instead of 2 andps with memory loads and an orps) will
stay. Win!
There is one other change worth noting here. We had hilariously wrong
vectorization cost estimates for vselect because we fell through to the
code path that assumed all "expand" vector operations are scalarized.
However, the "expand" lowering of VSELECT is vector bit math, most
definitely not scalarized. So now we go back to the correct if horribly
naive cost of "1" for "not scalarized". If anyone wants to add actual
modeling of shuffle costs, that would be cool, but this seems an
improvement on its own. Note the removal of 16 and 32 "costs" for doing
a blend. Even in SSE2 we can blend in fewer than 16 instructions. ;] Of
course, we don't right now because of OMG bad code, but I'm going to fix
that. Next patch. I promise.
llvm-svn: 229835
2015-02-19 18:36:19 +08:00
|
|
|
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
|
|
|
|
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: vsel_float:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSSE3: # BB#0: # %entry
|
[x86,sdag] Two interrelated changes to the x86 and sdag code.
First, don't combine bit masking into vector shuffles (even ones the
target can handle) once operation legalization has taken place. Custom
legalization of vector shuffles may exist for these patterns (making the
predicate return true) but that custom legalization may in some cases
produce the exact bit math this matches. We only really want to handle
this prior to operation legalization.
However, the x86 backend, in a fit of awesome, relied on this. What it
would do is mark VSELECTs as expand, which would turn them into
arithmetic, which this would then match back into vector shuffles, which
we would then lower properly. Amazing.
Instead, the second change is to teach the x86 backend to directly form
vector shuffles from VSELECT nodes with constant conditions, and to mark
all of the vector types we support lowering blends as shuffles as custom
VSELECT lowering. We still mark the forms which actually support
variable blends as *legal* so that the custom lowering is bypassed, and
the legal lowering can even be used by the vector shuffle legalization
(yes, i know, this is confusing. but that's how the patterns are
written).
This makes the VSELECT lowering much more sensible, and in fact should
fix a bunch of bugs with it. However, as you'll see in the test cases,
right now what it does is point out the *hilarious* deficiency of the
new vector shuffle lowering when it comes to blends. Fortunately, my
very next patch fixes that. I can't submit it yet, because that patch,
somewhat obviously, forms the exact and/or pattern that the DAG combine
is matching here! Without this patch, teaching the vector shuffle
lowering to produce the right code infloops in the DAG combiner. With
this patch alone, we produce terrible code but at least lower through
the right paths. With both patches, all the regressions here should be
fixed, and a bunch of the improvements (like using 2 shufps with no
memory loads instead of 2 andps with memory loads and an orps) will
stay. Win!
There is one other change worth noting here. We had hilariously wrong
vectorization cost estimates for vselect because we fell through to the
code path that assumed all "expand" vector operations are scalarized.
However, the "expand" lowering of VSELECT is vector bit math, most
definitely not scalarized. So now we go back to the correct if horribly
naive cost of "1" for "not scalarized". If anyone wants to add actual
modeling of shuffle costs, that would be cool, but this seems an
improvement on its own. Note the removal of 16 and 32 "costs" for doing
a blend. Even in SSE2 we can blend in fewer than 16 instructions. ;] Of
course, we don't right now because of OMG bad code, but I'm going to fix
that. Next patch. I promise.
llvm-svn: 229835
2015-02-19 18:36:19 +08:00
|
|
|
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
|
|
|
|
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSE41-LABEL: vsel_float:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSE41: # BB#0: # %entry
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
|
|
|
|
; SSE41-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX-LABEL: vsel_float:
|
2014-10-02 14:52:19 +08:00
|
|
|
; AVX: # BB#0: # %entry
|
2014-10-02 05:03:21 +08:00
|
|
|
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
|
|
|
|
; AVX-NEXT: retq
|
2014-10-02 14:52:19 +08:00
|
|
|
entry:
|
[X86] Teach how to combine a vselect into a movss/movsd
Add target specific rules for combining vselect dag nodes into movss/movsd
when possible.
If the vector type of the vselect dag node in input is either MVT::v4i13 or
MVT::v4f32, then try to fold according to rules:
1) fold (vselect (build_vector (0, -1, -1, -1)), A, B) -> (movss A, B)
2) fold (vselect (build_vector (-1, 0, 0, 0)), A, B) -> (movss B, A)
If the vector type of the vselect dag node in input is either MVT::v2i64 or
MVT::v2f64 (and we have SSE2), then try to fold according to rules:
3) fold (vselect (build_vector (0, -1)), A, B) -> (movsd A, B)
4) fold (vselect (build_vector (-1, 0)), A, B) -> (movsd B, A)
llvm-svn: 199683
2014-01-21 03:35:22 +08:00
|
|
|
%vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %v1, <4 x float> %v2
|
2011-09-09 05:05:43 +08:00
|
|
|
ret <4 x float> %vsel
|
|
|
|
}
|
|
|
|
|
2014-10-02 04:56:57 +08:00
|
|
|
define <4 x float> @vsel_float2(<4 x float> %v1, <4 x float> %v2) {
|
2014-11-24 20:23:15 +08:00
|
|
|
; SSE2-LABEL: vsel_float2:
|
|
|
|
; SSE2: # BB#0: # %entry
|
2015-02-04 18:46:53 +08:00
|
|
|
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
|
2014-11-24 20:23:15 +08:00
|
|
|
; SSE2-NEXT: movaps %xmm1, %xmm0
|
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: vsel_float2:
|
|
|
|
; SSSE3: # BB#0: # %entry
|
2015-02-04 18:46:53 +08:00
|
|
|
; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
|
2014-11-24 20:23:15 +08:00
|
|
|
; SSSE3-NEXT: movaps %xmm1, %xmm0
|
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSE41-LABEL: vsel_float2:
|
|
|
|
; SSE41: # BB#0: # %entry
|
|
|
|
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
|
|
|
|
; SSE41-NEXT: retq
|
2014-10-02 05:03:21 +08:00
|
|
|
;
|
|
|
|
; AVX-LABEL: vsel_float2:
|
2014-10-02 14:52:19 +08:00
|
|
|
; AVX: # BB#0: # %entry
|
2015-02-04 18:46:53 +08:00
|
|
|
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
|
2014-10-02 05:03:21 +08:00
|
|
|
; AVX-NEXT: retq
|
2014-10-02 14:52:19 +08:00
|
|
|
entry:
|
2014-10-02 04:56:57 +08:00
|
|
|
%vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %v1, <4 x float> %v2
|
|
|
|
ret <4 x float> %vsel
|
|
|
|
}
|
|
|
|
|
2014-10-02 05:07:07 +08:00
|
|
|
define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
|
|
|
|
; SSE2-LABEL: vsel_4xi8:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSE2: # BB#0: # %entry
|
[x86,sdag] Two interrelated changes to the x86 and sdag code.
First, don't combine bit masking into vector shuffles (even ones the
target can handle) once operation legalization has taken place. Custom
legalization of vector shuffles may exist for these patterns (making the
predicate return true) but that custom legalization may in some cases
produce the exact bit math this matches. We only really want to handle
this prior to operation legalization.
However, the x86 backend, in a fit of awesome, relied on this. What it
would do is mark VSELECTs as expand, which would turn them into
arithmetic, which this would then match back into vector shuffles, which
we would then lower properly. Amazing.
Instead, the second change is to teach the x86 backend to directly form
vector shuffles from VSELECT nodes with constant conditions, and to mark
all of the vector types we support lowering blends as shuffles as custom
VSELECT lowering. We still mark the forms which actually support
variable blends as *legal* so that the custom lowering is bypassed, and
the legal lowering can even be used by the vector shuffle legalization
(yes, i know, this is confusing. but that's how the patterns are
written).
This makes the VSELECT lowering much more sensible, and in fact should
fix a bunch of bugs with it. However, as you'll see in the test cases,
right now what it does is point out the *hilarious* deficiency of the
new vector shuffle lowering when it comes to blends. Fortunately, my
very next patch fixes that. I can't submit it yet, because that patch,
somewhat obviously, forms the exact and/or pattern that the DAG combine
is matching here! Without this patch, teaching the vector shuffle
lowering to produce the right code infloops in the DAG combiner. With
this patch alone, we produce terrible code but at least lower through
the right paths. With both patches, all the regressions here should be
fixed, and a bunch of the improvements (like using 2 shufps with no
memory loads instead of 2 andps with memory loads and an orps) will
stay. Win!
There is one other change worth noting here. We had hilariously wrong
vectorization cost estimates for vselect because we fell through to the
code path that assumed all "expand" vector operations are scalarized.
However, the "expand" lowering of VSELECT is vector bit math, most
definitely not scalarized. So now we go back to the correct if horribly
naive cost of "1" for "not scalarized". If anyone wants to add actual
modeling of shuffle costs, that would be cool, but this seems an
improvement on its own. Note the removal of 16 and 32 "costs" for doing
a blend. Even in SSE2 we can blend in fewer than 16 instructions. ;] Of
course, we don't right now because of OMG bad code, but I'm going to fix
that. Next patch. I promise.
llvm-svn: 229835
2015-02-19 18:36:19 +08:00
|
|
|
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
|
|
|
|
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
|
2014-10-02 05:07:07 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: vsel_4xi8:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSSE3: # BB#0: # %entry
|
[x86,sdag] Two interrelated changes to the x86 and sdag code.
First, don't combine bit masking into vector shuffles (even ones the
target can handle) once operation legalization has taken place. Custom
legalization of vector shuffles may exist for these patterns (making the
predicate return true) but that custom legalization may in some cases
produce the exact bit math this matches. We only really want to handle
this prior to operation legalization.
However, the x86 backend, in a fit of awesome, relied on this. What it
would do is mark VSELECTs as expand, which would turn them into
arithmetic, which this would then match back into vector shuffles, which
we would then lower properly. Amazing.
Instead, the second change is to teach the x86 backend to directly form
vector shuffles from VSELECT nodes with constant conditions, and to mark
all of the vector types we support lowering blends as shuffles as custom
VSELECT lowering. We still mark the forms which actually support
variable blends as *legal* so that the custom lowering is bypassed, and
the legal lowering can even be used by the vector shuffle legalization
(yes, i know, this is confusing. but that's how the patterns are
written).
This makes the VSELECT lowering much more sensible, and in fact should
fix a bunch of bugs with it. However, as you'll see in the test cases,
right now what it does is point out the *hilarious* deficiency of the
new vector shuffle lowering when it comes to blends. Fortunately, my
very next patch fixes that. I can't submit it yet, because that patch,
somewhat obviously, forms the exact and/or pattern that the DAG combine
is matching here! Without this patch, teaching the vector shuffle
lowering to produce the right code infloops in the DAG combiner. With
this patch alone, we produce terrible code but at least lower through
the right paths. With both patches, all the regressions here should be
fixed, and a bunch of the improvements (like using 2 shufps with no
memory loads instead of 2 andps with memory loads and an orps) will
stay. Win!
There is one other change worth noting here. We had hilariously wrong
vectorization cost estimates for vselect because we fell through to the
code path that assumed all "expand" vector operations are scalarized.
However, the "expand" lowering of VSELECT is vector bit math, most
definitely not scalarized. So now we go back to the correct if horribly
naive cost of "1" for "not scalarized". If anyone wants to add actual
modeling of shuffle costs, that would be cool, but this seems an
improvement on its own. Note the removal of 16 and 32 "costs" for doing
a blend. Even in SSE2 we can blend in fewer than 16 instructions. ;] Of
course, we don't right now because of OMG bad code, but I'm going to fix
that. Next patch. I promise.
llvm-svn: 229835
2015-02-19 18:36:19 +08:00
|
|
|
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
|
|
|
|
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
|
2014-10-02 05:07:07 +08:00
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSE41-LABEL: vsel_4xi8:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSE41: # BB#0: # %entry
|
[x86] Enable the new vector shuffle lowering by default.
Update the entire regression test suite for the new shuffles. Remove
most of the old testing which was devoted to the old shuffle lowering
path and is no longer relevant really. Also remove a few other random
tests that only really exercised shuffles and only incidently or without
any interesting aspects to them.
Benchmarking that I have done shows a few small regressions with this on
LNT, zero measurable regressions on real, large applications, and for
several benchmarks where the loop vectorizer fires in the hot path it
shows 5% to 40% improvements for SSE2 and SSE3 code running on Sandy
Bridge machines. Running on AMD machines shows even more dramatic
improvements.
When using newer ISA vector extensions the gains are much more modest,
but the code is still better on the whole. There are a few regressions
being tracked (PR21137, PR21138, PR21139) but by and large this is
expected to be a win for x86 generated code performance.
It is also more correct than the code it replaces. I have fuzz tested
this extensively with ISA extensions up through AVX2 and found no
crashes or miscompiles (yet...). The old lowering had a few miscompiles
and crashers after a somewhat smaller amount of fuzz testing.
There is one significant area where the new code path lags behind and
that is in AVX-512 support. However, there was *extremely little*
support for that already and so this isn't a significant step backwards
and the new framework will probably make it easier to implement lowering
that uses the full power of AVX-512's table-based shuffle+blend (IMO).
Many thanks to Quentin, Andrea, Robert, and others for benchmarking
assistance. Thanks to Adam and others for help with AVX-512. Thanks to
Hal, Eric, and *many* others for answering my incessant questions about
how the backend actually works. =]
I will leave the old code path in the tree until the 3 PRs above are at
least resolved to folks' satisfaction. Then I will rip it (and 1000s of
lines of code) out. =] I don't expect this flag to stay around for very
long. It may not survive next week.
llvm-svn: 219046
2014-10-04 11:52:55 +08:00
|
|
|
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
|
2014-10-02 05:07:07 +08:00
|
|
|
; SSE41-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: vsel_4xi8:
|
2014-10-02 14:52:19 +08:00
|
|
|
; AVX1: # BB#0: # %entry
|
[x86] Enable the new vector shuffle lowering by default.
Update the entire regression test suite for the new shuffles. Remove
most of the old testing which was devoted to the old shuffle lowering
path and is no longer relevant really. Also remove a few other random
tests that only really exercised shuffles and only incidently or without
any interesting aspects to them.
Benchmarking that I have done shows a few small regressions with this on
LNT, zero measurable regressions on real, large applications, and for
several benchmarks where the loop vectorizer fires in the hot path it
shows 5% to 40% improvements for SSE2 and SSE3 code running on Sandy
Bridge machines. Running on AMD machines shows even more dramatic
improvements.
When using newer ISA vector extensions the gains are much more modest,
but the code is still better on the whole. There are a few regressions
being tracked (PR21137, PR21138, PR21139) but by and large this is
expected to be a win for x86 generated code performance.
It is also more correct than the code it replaces. I have fuzz tested
this extensively with ISA extensions up through AVX2 and found no
crashes or miscompiles (yet...). The old lowering had a few miscompiles
and crashers after a somewhat smaller amount of fuzz testing.
There is one significant area where the new code path lags behind and
that is in AVX-512 support. However, there was *extremely little*
support for that already and so this isn't a significant step backwards
and the new framework will probably make it easier to implement lowering
that uses the full power of AVX-512's table-based shuffle+blend (IMO).
Many thanks to Quentin, Andrea, Robert, and others for benchmarking
assistance. Thanks to Adam and others for help with AVX-512. Thanks to
Hal, Eric, and *many* others for answering my incessant questions about
how the backend actually works. =]
I will leave the old code path in the tree until the 3 PRs above are at
least resolved to folks' satisfaction. Then I will rip it (and 1000s of
lines of code) out. =] I don't expect this flag to stay around for very
long. It may not survive next week.
llvm-svn: 219046
2014-10-04 11:52:55 +08:00
|
|
|
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
|
2014-10-02 05:07:07 +08:00
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: vsel_4xi8:
|
2014-10-02 14:52:19 +08:00
|
|
|
; AVX2: # BB#0: # %entry
|
2014-10-02 05:07:07 +08:00
|
|
|
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
|
|
|
|
; AVX2-NEXT: retq
|
2014-10-02 14:52:19 +08:00
|
|
|
entry:
|
2014-10-02 05:07:07 +08:00
|
|
|
%vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i8> %v1, <4 x i8> %v2
|
|
|
|
ret <4 x i8> %vsel
|
|
|
|
}
|
|
|
|
|
|
|
|
define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) {
|
|
|
|
; SSE2-LABEL: vsel_4xi16:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSE2: # BB#0: # %entry
|
[x86,sdag] Two interrelated changes to the x86 and sdag code.
First, don't combine bit masking into vector shuffles (even ones the
target can handle) once operation legalization has taken place. Custom
legalization of vector shuffles may exist for these patterns (making the
predicate return true) but that custom legalization may in some cases
produce the exact bit math this matches. We only really want to handle
this prior to operation legalization.
However, the x86 backend, in a fit of awesome, relied on this. What it
would do is mark VSELECTs as expand, which would turn them into
arithmetic, which this would then match back into vector shuffles, which
we would then lower properly. Amazing.
Instead, the second change is to teach the x86 backend to directly form
vector shuffles from VSELECT nodes with constant conditions, and to mark
all of the vector types we support lowering blends as shuffles as custom
VSELECT lowering. We still mark the forms which actually support
variable blends as *legal* so that the custom lowering is bypassed, and
the legal lowering can even be used by the vector shuffle legalization
(yes, i know, this is confusing. but that's how the patterns are
written).
This makes the VSELECT lowering much more sensible, and in fact should
fix a bunch of bugs with it. However, as you'll see in the test cases,
right now what it does is point out the *hilarious* deficiency of the
new vector shuffle lowering when it comes to blends. Fortunately, my
very next patch fixes that. I can't submit it yet, because that patch,
somewhat obviously, forms the exact and/or pattern that the DAG combine
is matching here! Without this patch, teaching the vector shuffle
lowering to produce the right code infloops in the DAG combiner. With
this patch alone, we produce terrible code but at least lower through
the right paths. With both patches, all the regressions here should be
fixed, and a bunch of the improvements (like using 2 shufps with no
memory loads instead of 2 andps with memory loads and an orps) will
stay. Win!
There is one other change worth noting here. We had hilariously wrong
vectorization cost estimates for vselect because we fell through to the
code path that assumed all "expand" vector operations are scalarized.
However, the "expand" lowering of VSELECT is vector bit math, most
definitely not scalarized. So now we go back to the correct if horribly
naive cost of "1" for "not scalarized". If anyone wants to add actual
modeling of shuffle costs, that would be cool, but this seems an
improvement on its own. Note the removal of 16 and 32 "costs" for doing
a blend. Even in SSE2 we can blend in fewer than 16 instructions. ;] Of
course, we don't right now because of OMG bad code, but I'm going to fix
that. Next patch. I promise.
llvm-svn: 229835
2015-02-19 18:36:19 +08:00
|
|
|
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
|
|
|
|
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
|
|
|
|
; SSE2-NEXT: movaps %xmm1, %xmm0
|
2014-10-02 05:07:07 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: vsel_4xi16:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSSE3: # BB#0: # %entry
|
[x86,sdag] Two interrelated changes to the x86 and sdag code.
First, don't combine bit masking into vector shuffles (even ones the
target can handle) once operation legalization has taken place. Custom
legalization of vector shuffles may exist for these patterns (making the
predicate return true) but that custom legalization may in some cases
produce the exact bit math this matches. We only really want to handle
this prior to operation legalization.
However, the x86 backend, in a fit of awesome, relied on this. What it
would do is mark VSELECTs as expand, which would turn them into
arithmetic, which this would then match back into vector shuffles, which
we would then lower properly. Amazing.
Instead, the second change is to teach the x86 backend to directly form
vector shuffles from VSELECT nodes with constant conditions, and to mark
all of the vector types we support lowering blends as shuffles as custom
VSELECT lowering. We still mark the forms which actually support
variable blends as *legal* so that the custom lowering is bypassed, and
the legal lowering can even be used by the vector shuffle legalization
(yes, i know, this is confusing. but that's how the patterns are
written).
This makes the VSELECT lowering much more sensible, and in fact should
fix a bunch of bugs with it. However, as you'll see in the test cases,
right now what it does is point out the *hilarious* deficiency of the
new vector shuffle lowering when it comes to blends. Fortunately, my
very next patch fixes that. I can't submit it yet, because that patch,
somewhat obviously, forms the exact and/or pattern that the DAG combine
is matching here! Without this patch, teaching the vector shuffle
lowering to produce the right code infloops in the DAG combiner. With
this patch alone, we produce terrible code but at least lower through
the right paths. With both patches, all the regressions here should be
fixed, and a bunch of the improvements (like using 2 shufps with no
memory loads instead of 2 andps with memory loads and an orps) will
stay. Win!
There is one other change worth noting here. We had hilariously wrong
vectorization cost estimates for vselect because we fell through to the
code path that assumed all "expand" vector operations are scalarized.
However, the "expand" lowering of VSELECT is vector bit math, most
definitely not scalarized. So now we go back to the correct if horribly
naive cost of "1" for "not scalarized". If anyone wants to add actual
modeling of shuffle costs, that would be cool, but this seems an
improvement on its own. Note the removal of 16 and 32 "costs" for doing
a blend. Even in SSE2 we can blend in fewer than 16 instructions. ;] Of
course, we don't right now because of OMG bad code, but I'm going to fix
that. Next patch. I promise.
llvm-svn: 229835
2015-02-19 18:36:19 +08:00
|
|
|
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
|
|
|
|
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
|
|
|
|
; SSSE3-NEXT: movaps %xmm1, %xmm0
|
2014-10-02 05:07:07 +08:00
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSE41-LABEL: vsel_4xi16:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSE41: # BB#0: # %entry
|
[x86] Enable the new vector shuffle lowering by default.
Update the entire regression test suite for the new shuffles. Remove
most of the old testing which was devoted to the old shuffle lowering
path and is no longer relevant really. Also remove a few other random
tests that only really exercised shuffles and only incidently or without
any interesting aspects to them.
Benchmarking that I have done shows a few small regressions with this on
LNT, zero measurable regressions on real, large applications, and for
several benchmarks where the loop vectorizer fires in the hot path it
shows 5% to 40% improvements for SSE2 and SSE3 code running on Sandy
Bridge machines. Running on AMD machines shows even more dramatic
improvements.
When using newer ISA vector extensions the gains are much more modest,
but the code is still better on the whole. There are a few regressions
being tracked (PR21137, PR21138, PR21139) but by and large this is
expected to be a win for x86 generated code performance.
It is also more correct than the code it replaces. I have fuzz tested
this extensively with ISA extensions up through AVX2 and found no
crashes or miscompiles (yet...). The old lowering had a few miscompiles
and crashers after a somewhat smaller amount of fuzz testing.
There is one significant area where the new code path lags behind and
that is in AVX-512 support. However, there was *extremely little*
support for that already and so this isn't a significant step backwards
and the new framework will probably make it easier to implement lowering
that uses the full power of AVX-512's table-based shuffle+blend (IMO).
Many thanks to Quentin, Andrea, Robert, and others for benchmarking
assistance. Thanks to Adam and others for help with AVX-512. Thanks to
Hal, Eric, and *many* others for answering my incessant questions about
how the backend actually works. =]
I will leave the old code path in the tree until the 3 PRs above are at
least resolved to folks' satisfaction. Then I will rip it (and 1000s of
lines of code) out. =] I don't expect this flag to stay around for very
long. It may not survive next week.
llvm-svn: 219046
2014-10-04 11:52:55 +08:00
|
|
|
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
|
2014-10-02 05:07:07 +08:00
|
|
|
; SSE41-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: vsel_4xi16:
|
2014-10-02 14:52:19 +08:00
|
|
|
; AVX1: # BB#0: # %entry
|
[x86] Enable the new vector shuffle lowering by default.
Update the entire regression test suite for the new shuffles. Remove
most of the old testing which was devoted to the old shuffle lowering
path and is no longer relevant really. Also remove a few other random
tests that only really exercised shuffles and only incidently or without
any interesting aspects to them.
Benchmarking that I have done shows a few small regressions with this on
LNT, zero measurable regressions on real, large applications, and for
several benchmarks where the loop vectorizer fires in the hot path it
shows 5% to 40% improvements for SSE2 and SSE3 code running on Sandy
Bridge machines. Running on AMD machines shows even more dramatic
improvements.
When using newer ISA vector extensions the gains are much more modest,
but the code is still better on the whole. There are a few regressions
being tracked (PR21137, PR21138, PR21139) but by and large this is
expected to be a win for x86 generated code performance.
It is also more correct than the code it replaces. I have fuzz tested
this extensively with ISA extensions up through AVX2 and found no
crashes or miscompiles (yet...). The old lowering had a few miscompiles
and crashers after a somewhat smaller amount of fuzz testing.
There is one significant area where the new code path lags behind and
that is in AVX-512 support. However, there was *extremely little*
support for that already and so this isn't a significant step backwards
and the new framework will probably make it easier to implement lowering
that uses the full power of AVX-512's table-based shuffle+blend (IMO).
Many thanks to Quentin, Andrea, Robert, and others for benchmarking
assistance. Thanks to Adam and others for help with AVX-512. Thanks to
Hal, Eric, and *many* others for answering my incessant questions about
how the backend actually works. =]
I will leave the old code path in the tree until the 3 PRs above are at
least resolved to folks' satisfaction. Then I will rip it (and 1000s of
lines of code) out. =] I don't expect this flag to stay around for very
long. It may not survive next week.
llvm-svn: 219046
2014-10-04 11:52:55 +08:00
|
|
|
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
|
2014-10-02 05:07:07 +08:00
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: vsel_4xi16:
|
2014-10-02 14:52:19 +08:00
|
|
|
; AVX2: # BB#0: # %entry
|
2014-10-02 05:07:07 +08:00
|
|
|
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
|
|
|
|
; AVX2-NEXT: retq
|
2014-10-02 14:52:19 +08:00
|
|
|
entry:
|
2014-10-02 05:07:07 +08:00
|
|
|
%vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i16> %v1, <4 x i16> %v2
|
|
|
|
ret <4 x i16> %vsel
|
|
|
|
}
|
|
|
|
|
2011-09-09 05:05:43 +08:00
|
|
|
define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE2-LABEL: vsel_i32:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSE2: # BB#0: # %entry
|
[x86,sdag] Two interrelated changes to the x86 and sdag code.
First, don't combine bit masking into vector shuffles (even ones the
target can handle) once operation legalization has taken place. Custom
legalization of vector shuffles may exist for these patterns (making the
predicate return true) but that custom legalization may in some cases
produce the exact bit math this matches. We only really want to handle
this prior to operation legalization.
However, the x86 backend, in a fit of awesome, relied on this. What it
would do is mark VSELECTs as expand, which would turn them into
arithmetic, which this would then match back into vector shuffles, which
we would then lower properly. Amazing.
Instead, the second change is to teach the x86 backend to directly form
vector shuffles from VSELECT nodes with constant conditions, and to mark
all of the vector types we support lowering blends as shuffles as custom
VSELECT lowering. We still mark the forms which actually support
variable blends as *legal* so that the custom lowering is bypassed, and
the legal lowering can even be used by the vector shuffle legalization
(yes, i know, this is confusing. but that's how the patterns are
written).
This makes the VSELECT lowering much more sensible, and in fact should
fix a bunch of bugs with it. However, as you'll see in the test cases,
right now what it does is point out the *hilarious* deficiency of the
new vector shuffle lowering when it comes to blends. Fortunately, my
very next patch fixes that. I can't submit it yet, because that patch,
somewhat obviously, forms the exact and/or pattern that the DAG combine
is matching here! Without this patch, teaching the vector shuffle
lowering to produce the right code infloops in the DAG combiner. With
this patch alone, we produce terrible code but at least lower through
the right paths. With both patches, all the regressions here should be
fixed, and a bunch of the improvements (like using 2 shufps with no
memory loads instead of 2 andps with memory loads and an orps) will
stay. Win!
There is one other change worth noting here. We had hilariously wrong
vectorization cost estimates for vselect because we fell through to the
code path that assumed all "expand" vector operations are scalarized.
However, the "expand" lowering of VSELECT is vector bit math, most
definitely not scalarized. So now we go back to the correct if horribly
naive cost of "1" for "not scalarized". If anyone wants to add actual
modeling of shuffle costs, that would be cool, but this seems an
improvement on its own. Note the removal of 16 and 32 "costs" for doing
a blend. Even in SSE2 we can blend in fewer than 16 instructions. ;] Of
course, we don't right now because of OMG bad code, but I'm going to fix
that. Next patch. I promise.
llvm-svn: 229835
2015-02-19 18:36:19 +08:00
|
|
|
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
|
|
|
|
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
|
|
|
|
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: vsel_i32:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSSE3: # BB#0: # %entry
|
[x86,sdag] Two interrelated changes to the x86 and sdag code.
First, don't combine bit masking into vector shuffles (even ones the
target can handle) once operation legalization has taken place. Custom
legalization of vector shuffles may exist for these patterns (making the
predicate return true) but that custom legalization may in some cases
produce the exact bit math this matches. We only really want to handle
this prior to operation legalization.
However, the x86 backend, in a fit of awesome, relied on this. What it
would do is mark VSELECTs as expand, which would turn them into
arithmetic, which this would then match back into vector shuffles, which
we would then lower properly. Amazing.
Instead, the second change is to teach the x86 backend to directly form
vector shuffles from VSELECT nodes with constant conditions, and to mark
all of the vector types we support lowering blends as shuffles as custom
VSELECT lowering. We still mark the forms which actually support
variable blends as *legal* so that the custom lowering is bypassed, and
the legal lowering can even be used by the vector shuffle legalization
(yes, i know, this is confusing. but that's how the patterns are
written).
This makes the VSELECT lowering much more sensible, and in fact should
fix a bunch of bugs with it. However, as you'll see in the test cases,
right now what it does is point out the *hilarious* deficiency of the
new vector shuffle lowering when it comes to blends. Fortunately, my
very next patch fixes that. I can't submit it yet, because that patch,
somewhat obviously, forms the exact and/or pattern that the DAG combine
is matching here! Without this patch, teaching the vector shuffle
lowering to produce the right code infloops in the DAG combiner. With
this patch alone, we produce terrible code but at least lower through
the right paths. With both patches, all the regressions here should be
fixed, and a bunch of the improvements (like using 2 shufps with no
memory loads instead of 2 andps with memory loads and an orps) will
stay. Win!
There is one other change worth noting here. We had hilariously wrong
vectorization cost estimates for vselect because we fell through to the
code path that assumed all "expand" vector operations are scalarized.
However, the "expand" lowering of VSELECT is vector bit math, most
definitely not scalarized. So now we go back to the correct if horribly
naive cost of "1" for "not scalarized". If anyone wants to add actual
modeling of shuffle costs, that would be cool, but this seems an
improvement on its own. Note the removal of 16 and 32 "costs" for doing
a blend. Even in SSE2 we can blend in fewer than 16 instructions. ;] Of
course, we don't right now because of OMG bad code, but I'm going to fix
that. Next patch. I promise.
llvm-svn: 229835
2015-02-19 18:36:19 +08:00
|
|
|
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
|
|
|
|
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
|
|
|
|
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSE41-LABEL: vsel_i32:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSE41: # BB#0: # %entry
|
[x86] Enable the new vector shuffle lowering by default.
Update the entire regression test suite for the new shuffles. Remove
most of the old testing which was devoted to the old shuffle lowering
path and is no longer relevant really. Also remove a few other random
tests that only really exercised shuffles and only incidently or without
any interesting aspects to them.
Benchmarking that I have done shows a few small regressions with this on
LNT, zero measurable regressions on real, large applications, and for
several benchmarks where the loop vectorizer fires in the hot path it
shows 5% to 40% improvements for SSE2 and SSE3 code running on Sandy
Bridge machines. Running on AMD machines shows even more dramatic
improvements.
When using newer ISA vector extensions the gains are much more modest,
but the code is still better on the whole. There are a few regressions
being tracked (PR21137, PR21138, PR21139) but by and large this is
expected to be a win for x86 generated code performance.
It is also more correct than the code it replaces. I have fuzz tested
this extensively with ISA extensions up through AVX2 and found no
crashes or miscompiles (yet...). The old lowering had a few miscompiles
and crashers after a somewhat smaller amount of fuzz testing.
There is one significant area where the new code path lags behind and
that is in AVX-512 support. However, there was *extremely little*
support for that already and so this isn't a significant step backwards
and the new framework will probably make it easier to implement lowering
that uses the full power of AVX-512's table-based shuffle+blend (IMO).
Many thanks to Quentin, Andrea, Robert, and others for benchmarking
assistance. Thanks to Adam and others for help with AVX-512. Thanks to
Hal, Eric, and *many* others for answering my incessant questions about
how the backend actually works. =]
I will leave the old code path in the tree until the 3 PRs above are at
least resolved to folks' satisfaction. Then I will rip it (and 1000s of
lines of code) out. =] I don't expect this flag to stay around for very
long. It may not survive next week.
llvm-svn: 219046
2014-10-04 11:52:55 +08:00
|
|
|
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE41-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: vsel_i32:
|
2014-10-02 14:52:19 +08:00
|
|
|
; AVX1: # BB#0: # %entry
|
[x86] Enable the new vector shuffle lowering by default.
Update the entire regression test suite for the new shuffles. Remove
most of the old testing which was devoted to the old shuffle lowering
path and is no longer relevant really. Also remove a few other random
tests that only really exercised shuffles and only incidently or without
any interesting aspects to them.
Benchmarking that I have done shows a few small regressions with this on
LNT, zero measurable regressions on real, large applications, and for
several benchmarks where the loop vectorizer fires in the hot path it
shows 5% to 40% improvements for SSE2 and SSE3 code running on Sandy
Bridge machines. Running on AMD machines shows even more dramatic
improvements.
When using newer ISA vector extensions the gains are much more modest,
but the code is still better on the whole. There are a few regressions
being tracked (PR21137, PR21138, PR21139) but by and large this is
expected to be a win for x86 generated code performance.
It is also more correct than the code it replaces. I have fuzz tested
this extensively with ISA extensions up through AVX2 and found no
crashes or miscompiles (yet...). The old lowering had a few miscompiles
and crashers after a somewhat smaller amount of fuzz testing.
There is one significant area where the new code path lags behind and
that is in AVX-512 support. However, there was *extremely little*
support for that already and so this isn't a significant step backwards
and the new framework will probably make it easier to implement lowering
that uses the full power of AVX-512's table-based shuffle+blend (IMO).
Many thanks to Quentin, Andrea, Robert, and others for benchmarking
assistance. Thanks to Adam and others for help with AVX-512. Thanks to
Hal, Eric, and *many* others for answering my incessant questions about
how the backend actually works. =]
I will leave the old code path in the tree until the 3 PRs above are at
least resolved to folks' satisfaction. Then I will rip it (and 1000s of
lines of code) out. =] I don't expect this flag to stay around for very
long. It may not survive next week.
llvm-svn: 219046
2014-10-04 11:52:55 +08:00
|
|
|
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
|
2014-10-02 05:03:21 +08:00
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: vsel_i32:
|
2014-10-02 14:52:19 +08:00
|
|
|
; AVX2: # BB#0: # %entry
|
2014-10-02 05:03:21 +08:00
|
|
|
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
|
|
|
|
; AVX2-NEXT: retq
|
2014-10-02 14:52:19 +08:00
|
|
|
entry:
|
[X86] Teach how to combine a vselect into a movss/movsd
Add target specific rules for combining vselect dag nodes into movss/movsd
when possible.
If the vector type of the vselect dag node in input is either MVT::v4i13 or
MVT::v4f32, then try to fold according to rules:
1) fold (vselect (build_vector (0, -1, -1, -1)), A, B) -> (movss A, B)
2) fold (vselect (build_vector (-1, 0, 0, 0)), A, B) -> (movss B, A)
If the vector type of the vselect dag node in input is either MVT::v2i64 or
MVT::v2f64 (and we have SSE2), then try to fold according to rules:
3) fold (vselect (build_vector (0, -1)), A, B) -> (movsd A, B)
4) fold (vselect (build_vector (-1, 0)), A, B) -> (movsd B, A)
llvm-svn: 199683
2014-01-21 03:35:22 +08:00
|
|
|
%vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %v1, <4 x i32> %v2
|
2011-09-09 05:05:43 +08:00
|
|
|
ret <4 x i32> %vsel
|
|
|
|
}
|
|
|
|
|
|
|
|
define <2 x double> @vsel_double(<2 x double> %v1, <2 x double> %v2) {
|
2014-11-24 20:23:15 +08:00
|
|
|
; SSE2-LABEL: vsel_double:
|
|
|
|
; SSE2: # BB#0: # %entry
|
2015-02-04 18:46:53 +08:00
|
|
|
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
|
2015-02-04 18:58:53 +08:00
|
|
|
; SSE2-NEXT: movapd %xmm1, %xmm0
|
2014-11-24 20:23:15 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: vsel_double:
|
|
|
|
; SSSE3: # BB#0: # %entry
|
2015-02-04 18:46:53 +08:00
|
|
|
; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
|
2015-02-04 18:58:53 +08:00
|
|
|
; SSSE3-NEXT: movapd %xmm1, %xmm0
|
2014-11-24 20:23:15 +08:00
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSE41-LABEL: vsel_double:
|
|
|
|
; SSE41: # BB#0: # %entry
|
|
|
|
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
|
|
|
|
; SSE41-NEXT: retq
|
2014-10-02 05:03:21 +08:00
|
|
|
;
|
|
|
|
; AVX-LABEL: vsel_double:
|
2014-10-02 14:52:19 +08:00
|
|
|
; AVX: # BB#0: # %entry
|
2014-11-24 20:23:15 +08:00
|
|
|
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
|
2014-10-02 05:03:21 +08:00
|
|
|
; AVX-NEXT: retq
|
2014-10-02 14:52:19 +08:00
|
|
|
entry:
|
2011-09-09 05:05:43 +08:00
|
|
|
%vsel = select <2 x i1> <i1 true, i1 false>, <2 x double> %v1, <2 x double> %v2
|
|
|
|
ret <2 x double> %vsel
|
|
|
|
}
|
|
|
|
|
|
|
|
define <2 x i64> @vsel_i64(<2 x i64> %v1, <2 x i64> %v2) {
|
2014-11-24 20:23:15 +08:00
|
|
|
; SSE2-LABEL: vsel_i64:
|
|
|
|
; SSE2: # BB#0: # %entry
|
2015-02-04 18:46:53 +08:00
|
|
|
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
|
2015-02-04 18:58:53 +08:00
|
|
|
; SSE2-NEXT: movapd %xmm1, %xmm0
|
2014-11-24 20:23:15 +08:00
|
|
|
; SSE2-NEXT: retq
|
2014-10-02 05:03:21 +08:00
|
|
|
;
|
2014-11-24 20:23:15 +08:00
|
|
|
; SSSE3-LABEL: vsel_i64:
|
|
|
|
; SSSE3: # BB#0: # %entry
|
2015-02-04 18:46:53 +08:00
|
|
|
; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
|
2015-02-04 18:58:53 +08:00
|
|
|
; SSSE3-NEXT: movapd %xmm1, %xmm0
|
2014-11-24 20:23:15 +08:00
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSE41-LABEL: vsel_i64:
|
|
|
|
; SSE41: # BB#0: # %entry
|
|
|
|
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
|
|
|
|
; SSE41-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: vsel_i64:
|
|
|
|
; AVX1: # BB#0: # %entry
|
|
|
|
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: vsel_i64:
|
|
|
|
; AVX2: # BB#0: # %entry
|
|
|
|
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
|
|
|
|
; AVX2-NEXT: retq
|
2014-10-02 14:52:19 +08:00
|
|
|
entry:
|
2011-09-09 05:05:43 +08:00
|
|
|
%vsel = select <2 x i1> <i1 true, i1 false>, <2 x i64> %v1, <2 x i64> %v2
|
|
|
|
ret <2 x i64> %vsel
|
|
|
|
}
|
|
|
|
|
2014-10-02 04:56:57 +08:00
|
|
|
define <8 x i16> @vsel_8xi16(<8 x i16> %v1, <8 x i16> %v2) {
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE2-LABEL: vsel_8xi16:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSE2: # BB#0: # %entry
|
2015-02-19 18:46:52 +08:00
|
|
|
; SSE2-NEXT: movaps {{.*#+}} xmm2 = [0,65535,65535,65535,0,65535,65535,65535]
|
|
|
|
; SSE2-NEXT: andps %xmm2, %xmm1
|
|
|
|
; SSE2-NEXT: andnps %xmm0, %xmm2
|
|
|
|
; SSE2-NEXT: orps %xmm1, %xmm2
|
|
|
|
; SSE2-NEXT: movaps %xmm2, %xmm0
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: vsel_8xi16:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSSE3: # BB#0: # %entry
|
2015-02-19 18:46:52 +08:00
|
|
|
; SSSE3-NEXT: movaps {{.*#+}} xmm2 = [0,65535,65535,65535,0,65535,65535,65535]
|
|
|
|
; SSSE3-NEXT: andps %xmm2, %xmm1
|
|
|
|
; SSSE3-NEXT: andnps %xmm0, %xmm2
|
|
|
|
; SSSE3-NEXT: orps %xmm1, %xmm2
|
|
|
|
; SSSE3-NEXT: movaps %xmm2, %xmm0
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSE41-LABEL: vsel_8xi16:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSE41: # BB#0: # %entry
|
2014-11-05 07:25:08 +08:00
|
|
|
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE41-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX-LABEL: vsel_8xi16:
|
2014-10-02 14:52:19 +08:00
|
|
|
; AVX: # BB#0: # %entry
|
2014-10-02 05:03:21 +08:00
|
|
|
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
|
|
|
|
; AVX-NEXT: retq
|
2014-10-02 14:52:19 +08:00
|
|
|
entry:
|
2014-10-02 04:56:57 +08:00
|
|
|
%vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i16> %v1, <8 x i16> %v2
|
|
|
|
ret <8 x i16> %vsel
|
|
|
|
}
|
|
|
|
|
2011-09-09 05:05:43 +08:00
|
|
|
define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) {
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE2-LABEL: vsel_i8:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSE2: # BB#0: # %entry
|
2015-10-08 16:13:02 +08:00
|
|
|
; SSE2-NEXT: movaps {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
|
|
|
|
; SSE2-NEXT: andps %xmm2, %xmm1
|
|
|
|
; SSE2-NEXT: andnps %xmm0, %xmm2
|
|
|
|
; SSE2-NEXT: orps %xmm1, %xmm2
|
|
|
|
; SSE2-NEXT: movaps %xmm2, %xmm0
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: vsel_i8:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSSE3: # BB#0: # %entry
|
2015-10-08 16:13:02 +08:00
|
|
|
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero
|
|
|
|
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3],zero,xmm1[5,6,7],zero,xmm1[9,10,11],zero,xmm1[13,14,15]
|
2015-02-19 20:10:37 +08:00
|
|
|
; SSSE3-NEXT: por %xmm1, %xmm0
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSE41-LABEL: vsel_i8:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSE41: # BB#0: # %entry
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE41-NEXT: movdqa %xmm0, %xmm2
|
2015-10-08 16:13:02 +08:00
|
|
|
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
|
|
|
|
; SSE41-NEXT: pblendvb %xmm1, %xmm2
|
|
|
|
; SSE41-NEXT: movdqa %xmm2, %xmm0
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE41-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX-LABEL: vsel_i8:
|
2014-10-02 14:52:19 +08:00
|
|
|
; AVX: # BB#0: # %entry
|
2015-10-08 16:13:02 +08:00
|
|
|
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
|
|
|
|
; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
|
2014-10-02 05:03:21 +08:00
|
|
|
; AVX-NEXT: retq
|
2014-10-02 14:52:19 +08:00
|
|
|
entry:
|
2011-09-09 05:05:43 +08:00
|
|
|
%vsel = select <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <16 x i8> %v1, <16 x i8> %v2
|
|
|
|
ret <16 x i8> %vsel
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2011-09-10 04:29:17 +08:00
|
|
|
; AVX256 tests:
|
|
|
|
|
|
|
|
define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) {
|
2014-11-24 20:23:15 +08:00
|
|
|
; SSE2-LABEL: vsel_float8:
|
|
|
|
; SSE2: # BB#0: # %entry
|
2015-02-04 18:46:53 +08:00
|
|
|
; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
|
|
|
|
; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
|
2014-11-24 20:23:15 +08:00
|
|
|
; SSE2-NEXT: movaps %xmm2, %xmm0
|
|
|
|
; SSE2-NEXT: movaps %xmm3, %xmm1
|
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: vsel_float8:
|
|
|
|
; SSSE3: # BB#0: # %entry
|
2015-02-04 18:46:53 +08:00
|
|
|
; SSSE3-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
|
|
|
|
; SSSE3-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
|
2014-11-24 20:23:15 +08:00
|
|
|
; SSSE3-NEXT: movaps %xmm2, %xmm0
|
|
|
|
; SSSE3-NEXT: movaps %xmm3, %xmm1
|
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSE41-LABEL: vsel_float8:
|
|
|
|
; SSE41: # BB#0: # %entry
|
|
|
|
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
|
|
|
|
; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3]
|
|
|
|
; SSE41-NEXT: retq
|
2014-10-02 05:03:21 +08:00
|
|
|
;
|
|
|
|
; AVX-LABEL: vsel_float8:
|
2014-10-02 14:52:19 +08:00
|
|
|
; AVX: # BB#0: # %entry
|
2014-10-02 05:03:21 +08:00
|
|
|
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
|
|
|
|
; AVX-NEXT: retq
|
2014-10-02 14:52:19 +08:00
|
|
|
entry:
|
2011-09-10 04:29:17 +08:00
|
|
|
%vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x float> %v1, <8 x float> %v2
|
|
|
|
ret <8 x float> %vsel
|
|
|
|
}
|
|
|
|
|
|
|
|
define <8 x i32> @vsel_i328(<8 x i32> %v1, <8 x i32> %v2) {
|
2014-11-24 20:23:15 +08:00
|
|
|
; SSE2-LABEL: vsel_i328:
|
|
|
|
; SSE2: # BB#0: # %entry
|
2015-02-04 18:46:53 +08:00
|
|
|
; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
|
|
|
|
; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
|
2014-11-24 20:23:15 +08:00
|
|
|
; SSE2-NEXT: movaps %xmm2, %xmm0
|
|
|
|
; SSE2-NEXT: movaps %xmm3, %xmm1
|
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: vsel_i328:
|
|
|
|
; SSSE3: # BB#0: # %entry
|
2015-02-04 18:46:53 +08:00
|
|
|
; SSSE3-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
|
|
|
|
; SSSE3-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
|
2014-11-24 20:23:15 +08:00
|
|
|
; SSSE3-NEXT: movaps %xmm2, %xmm0
|
|
|
|
; SSSE3-NEXT: movaps %xmm3, %xmm1
|
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSE41-LABEL: vsel_i328:
|
|
|
|
; SSE41: # BB#0: # %entry
|
|
|
|
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
|
|
|
|
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5,6,7]
|
|
|
|
; SSE41-NEXT: retq
|
2014-10-02 05:03:21 +08:00
|
|
|
;
|
|
|
|
; AVX1-LABEL: vsel_i328:
|
2014-10-02 14:52:19 +08:00
|
|
|
; AVX1: # BB#0: # %entry
|
2014-10-02 05:03:21 +08:00
|
|
|
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: vsel_i328:
|
2014-10-02 14:52:19 +08:00
|
|
|
; AVX2: # BB#0: # %entry
|
2014-10-02 05:03:21 +08:00
|
|
|
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
|
|
|
|
; AVX2-NEXT: retq
|
2014-10-02 14:52:19 +08:00
|
|
|
entry:
|
2011-09-10 04:29:17 +08:00
|
|
|
%vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i32> %v1, <8 x i32> %v2
|
|
|
|
ret <8 x i32> %vsel
|
|
|
|
}
|
|
|
|
|
|
|
|
define <8 x double> @vsel_double8(<8 x double> %v1, <8 x double> %v2) {
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE2-LABEL: vsel_double8:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSE2: # BB#0: # %entry
|
2015-02-04 18:46:53 +08:00
|
|
|
; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
|
|
|
|
; SSE2-NEXT: movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1]
|
2015-02-04 18:58:53 +08:00
|
|
|
; SSE2-NEXT: movapd %xmm4, %xmm0
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE2-NEXT: movaps %xmm5, %xmm1
|
2015-02-04 18:58:53 +08:00
|
|
|
; SSE2-NEXT: movapd %xmm6, %xmm2
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE2-NEXT: movaps %xmm7, %xmm3
|
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: vsel_double8:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSSE3: # BB#0: # %entry
|
2015-02-04 18:46:53 +08:00
|
|
|
; SSSE3-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
|
|
|
|
; SSSE3-NEXT: movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1]
|
2015-02-04 18:58:53 +08:00
|
|
|
; SSSE3-NEXT: movapd %xmm4, %xmm0
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSSE3-NEXT: movaps %xmm5, %xmm1
|
2015-02-04 18:58:53 +08:00
|
|
|
; SSSE3-NEXT: movapd %xmm6, %xmm2
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSSE3-NEXT: movaps %xmm7, %xmm3
|
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSE41-LABEL: vsel_double8:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSE41: # BB#0: # %entry
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm4[1]
|
|
|
|
; SSE41-NEXT: blendpd {{.*#+}} xmm2 = xmm2[0],xmm6[1]
|
2014-11-06 10:25:03 +08:00
|
|
|
; SSE41-NEXT: movaps %xmm5, %xmm1
|
|
|
|
; SSE41-NEXT: movaps %xmm7, %xmm3
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE41-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX-LABEL: vsel_double8:
|
2014-10-02 14:52:19 +08:00
|
|
|
; AVX: # BB#0: # %entry
|
2014-10-02 05:03:21 +08:00
|
|
|
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3]
|
|
|
|
; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3]
|
|
|
|
; AVX-NEXT: retq
|
2014-10-02 14:52:19 +08:00
|
|
|
entry:
|
2011-09-10 04:29:17 +08:00
|
|
|
%vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x double> %v1, <8 x double> %v2
|
|
|
|
ret <8 x double> %vsel
|
|
|
|
}
|
|
|
|
|
|
|
|
define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) {
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE2-LABEL: vsel_i648:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSE2: # BB#0: # %entry
|
2015-02-04 18:46:53 +08:00
|
|
|
; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
|
|
|
|
; SSE2-NEXT: movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1]
|
2015-02-04 18:58:53 +08:00
|
|
|
; SSE2-NEXT: movapd %xmm4, %xmm0
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE2-NEXT: movaps %xmm5, %xmm1
|
2015-02-04 18:58:53 +08:00
|
|
|
; SSE2-NEXT: movapd %xmm6, %xmm2
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE2-NEXT: movaps %xmm7, %xmm3
|
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: vsel_i648:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSSE3: # BB#0: # %entry
|
2015-02-04 18:46:53 +08:00
|
|
|
; SSSE3-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
|
|
|
|
; SSSE3-NEXT: movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1]
|
2015-02-04 18:58:53 +08:00
|
|
|
; SSSE3-NEXT: movapd %xmm4, %xmm0
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSSE3-NEXT: movaps %xmm5, %xmm1
|
2015-02-04 18:58:53 +08:00
|
|
|
; SSSE3-NEXT: movapd %xmm6, %xmm2
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSSE3-NEXT: movaps %xmm7, %xmm3
|
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSE41-LABEL: vsel_i648:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSE41: # BB#0: # %entry
|
[x86,sdag] Two interrelated changes to the x86 and sdag code.
First, don't combine bit masking into vector shuffles (even ones the
target can handle) once operation legalization has taken place. Custom
legalization of vector shuffles may exist for these patterns (making the
predicate return true) but that custom legalization may in some cases
produce the exact bit math this matches. We only really want to handle
this prior to operation legalization.
However, the x86 backend, in a fit of awesome, relied on this. What it
would do is mark VSELECTs as expand, which would turn them into
arithmetic, which this would then match back into vector shuffles, which
we would then lower properly. Amazing.
Instead, the second change is to teach the x86 backend to directly form
vector shuffles from VSELECT nodes with constant conditions, and to mark
all of the vector types we support lowering blends as shuffles as custom
VSELECT lowering. We still mark the forms which actually support
variable blends as *legal* so that the custom lowering is bypassed, and
the legal lowering can even be used by the vector shuffle legalization
(yes, i know, this is confusing. but that's how the patterns are
written).
This makes the VSELECT lowering much more sensible, and in fact should
fix a bunch of bugs with it. However, as you'll see in the test cases,
right now what it does is point out the *hilarious* deficiency of the
new vector shuffle lowering when it comes to blends. Fortunately, my
very next patch fixes that. I can't submit it yet, because that patch,
somewhat obviously, forms the exact and/or pattern that the DAG combine
is matching here! Without this patch, teaching the vector shuffle
lowering to produce the right code infloops in the DAG combiner. With
this patch alone, we produce terrible code but at least lower through
the right paths. With both patches, all the regressions here should be
fixed, and a bunch of the improvements (like using 2 shufps with no
memory loads instead of 2 andps with memory loads and an orps) will
stay. Win!
There is one other change worth noting here. We had hilariously wrong
vectorization cost estimates for vselect because we fell through to the
code path that assumed all "expand" vector operations are scalarized.
However, the "expand" lowering of VSELECT is vector bit math, most
definitely not scalarized. So now we go back to the correct if horribly
naive cost of "1" for "not scalarized". If anyone wants to add actual
modeling of shuffle costs, that would be cool, but this seems an
improvement on its own. Note the removal of 16 and 32 "costs" for doing
a blend. Even in SSE2 we can blend in fewer than 16 instructions. ;] Of
course, we don't right now because of OMG bad code, but I'm going to fix
that. Next patch. I promise.
llvm-svn: 229835
2015-02-19 18:36:19 +08:00
|
|
|
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
|
|
|
|
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
|
2014-11-06 10:25:03 +08:00
|
|
|
; SSE41-NEXT: movaps %xmm5, %xmm1
|
|
|
|
; SSE41-NEXT: movaps %xmm7, %xmm3
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE41-NEXT: retq
|
|
|
|
;
|
[x86] Enable the new vector shuffle lowering by default.
Update the entire regression test suite for the new shuffles. Remove
most of the old testing which was devoted to the old shuffle lowering
path and is no longer relevant really. Also remove a few other random
tests that only really exercised shuffles and only incidently or without
any interesting aspects to them.
Benchmarking that I have done shows a few small regressions with this on
LNT, zero measurable regressions on real, large applications, and for
several benchmarks where the loop vectorizer fires in the hot path it
shows 5% to 40% improvements for SSE2 and SSE3 code running on Sandy
Bridge machines. Running on AMD machines shows even more dramatic
improvements.
When using newer ISA vector extensions the gains are much more modest,
but the code is still better on the whole. There are a few regressions
being tracked (PR21137, PR21138, PR21139) but by and large this is
expected to be a win for x86 generated code performance.
It is also more correct than the code it replaces. I have fuzz tested
this extensively with ISA extensions up through AVX2 and found no
crashes or miscompiles (yet...). The old lowering had a few miscompiles
and crashers after a somewhat smaller amount of fuzz testing.
There is one significant area where the new code path lags behind and
that is in AVX-512 support. However, there was *extremely little*
support for that already and so this isn't a significant step backwards
and the new framework will probably make it easier to implement lowering
that uses the full power of AVX-512's table-based shuffle+blend (IMO).
Many thanks to Quentin, Andrea, Robert, and others for benchmarking
assistance. Thanks to Adam and others for help with AVX-512. Thanks to
Hal, Eric, and *many* others for answering my incessant questions about
how the backend actually works. =]
I will leave the old code path in the tree until the 3 PRs above are at
least resolved to folks' satisfaction. Then I will rip it (and 1000s of
lines of code) out. =] I don't expect this flag to stay around for very
long. It may not survive next week.
llvm-svn: 219046
2014-10-04 11:52:55 +08:00
|
|
|
; AVX1-LABEL: vsel_i648:
|
|
|
|
; AVX1: # BB#0: # %entry
|
|
|
|
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3]
|
|
|
|
; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3]
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: vsel_i648:
|
|
|
|
; AVX2: # BB#0: # %entry
|
|
|
|
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
|
|
|
|
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7]
|
|
|
|
; AVX2-NEXT: retq
|
2014-10-02 14:52:19 +08:00
|
|
|
entry:
|
2011-09-10 04:29:17 +08:00
|
|
|
%vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i64> %v1, <8 x i64> %v2
|
|
|
|
ret <8 x i64> %vsel
|
|
|
|
}
|
|
|
|
|
2014-01-28 02:45:30 +08:00
|
|
|
define <4 x double> @vsel_double4(<4 x double> %v1, <4 x double> %v2) {
|
2014-11-24 20:23:15 +08:00
|
|
|
; SSE2-LABEL: vsel_double4:
|
|
|
|
; SSE2: # BB#0: # %entry
|
2015-02-04 18:46:53 +08:00
|
|
|
; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
|
|
|
|
; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
|
2015-02-04 18:58:53 +08:00
|
|
|
; SSE2-NEXT: movapd %xmm2, %xmm0
|
|
|
|
; SSE2-NEXT: movapd %xmm3, %xmm1
|
2014-11-24 20:23:15 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: vsel_double4:
|
|
|
|
; SSSE3: # BB#0: # %entry
|
2015-02-04 18:46:53 +08:00
|
|
|
; SSSE3-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
|
|
|
|
; SSSE3-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
|
2015-02-04 18:58:53 +08:00
|
|
|
; SSSE3-NEXT: movapd %xmm2, %xmm0
|
|
|
|
; SSSE3-NEXT: movapd %xmm3, %xmm1
|
2014-11-24 20:23:15 +08:00
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSE41-LABEL: vsel_double4:
|
|
|
|
; SSE41: # BB#0: # %entry
|
|
|
|
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
|
|
|
|
; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm1[0],xmm3[1]
|
|
|
|
; SSE41-NEXT: retq
|
2014-10-02 05:03:21 +08:00
|
|
|
;
|
|
|
|
; AVX-LABEL: vsel_double4:
|
2014-10-02 14:52:19 +08:00
|
|
|
; AVX: # BB#0: # %entry
|
2014-10-02 05:03:21 +08:00
|
|
|
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
|
|
|
|
; AVX-NEXT: retq
|
2014-10-02 14:52:19 +08:00
|
|
|
entry:
|
2014-01-28 02:45:30 +08:00
|
|
|
%vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x double> %v1, <4 x double> %v2
|
|
|
|
ret <4 x double> %vsel
|
|
|
|
}
|
|
|
|
|
2013-07-19 06:29:15 +08:00
|
|
|
define <2 x double> @testa(<2 x double> %x, <2 x double> %y) {
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE2-LABEL: testa:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSE2: # BB#0: # %entry
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE2-NEXT: movapd %xmm1, %xmm2
|
|
|
|
; SSE2-NEXT: cmplepd %xmm0, %xmm2
|
|
|
|
; SSE2-NEXT: andpd %xmm2, %xmm0
|
|
|
|
; SSE2-NEXT: andnpd %xmm1, %xmm2
|
|
|
|
; SSE2-NEXT: orpd %xmm2, %xmm0
|
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: testa:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSSE3: # BB#0: # %entry
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSSE3-NEXT: movapd %xmm1, %xmm2
|
|
|
|
; SSSE3-NEXT: cmplepd %xmm0, %xmm2
|
|
|
|
; SSSE3-NEXT: andpd %xmm2, %xmm0
|
|
|
|
; SSSE3-NEXT: andnpd %xmm1, %xmm2
|
|
|
|
; SSSE3-NEXT: orpd %xmm2, %xmm0
|
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSE41-LABEL: testa:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSE41: # BB#0: # %entry
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE41-NEXT: movapd %xmm0, %xmm2
|
|
|
|
; SSE41-NEXT: movapd %xmm1, %xmm0
|
|
|
|
; SSE41-NEXT: cmplepd %xmm2, %xmm0
|
|
|
|
; SSE41-NEXT: blendvpd %xmm2, %xmm1
|
|
|
|
; SSE41-NEXT: movapd %xmm1, %xmm0
|
|
|
|
; SSE41-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX-LABEL: testa:
|
2014-10-02 14:52:19 +08:00
|
|
|
; AVX: # BB#0: # %entry
|
2014-10-02 05:03:21 +08:00
|
|
|
; AVX-NEXT: vcmplepd %xmm0, %xmm1, %xmm2
|
|
|
|
; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
|
|
|
|
; AVX-NEXT: retq
|
2014-10-02 14:52:19 +08:00
|
|
|
entry:
|
2011-09-13 03:30:40 +08:00
|
|
|
%max_is_x = fcmp oge <2 x double> %x, %y
|
|
|
|
%max = select <2 x i1> %max_is_x, <2 x double> %x, <2 x double> %y
|
|
|
|
ret <2 x double> %max
|
|
|
|
}
|
|
|
|
|
2013-07-19 06:29:15 +08:00
|
|
|
define <2 x double> @testb(<2 x double> %x, <2 x double> %y) {
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE2-LABEL: testb:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSE2: # BB#0: # %entry
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE2-NEXT: movapd %xmm1, %xmm2
|
|
|
|
; SSE2-NEXT: cmpnlepd %xmm0, %xmm2
|
|
|
|
; SSE2-NEXT: andpd %xmm2, %xmm0
|
|
|
|
; SSE2-NEXT: andnpd %xmm1, %xmm2
|
|
|
|
; SSE2-NEXT: orpd %xmm2, %xmm0
|
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: testb:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSSE3: # BB#0: # %entry
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSSE3-NEXT: movapd %xmm1, %xmm2
|
|
|
|
; SSSE3-NEXT: cmpnlepd %xmm0, %xmm2
|
|
|
|
; SSSE3-NEXT: andpd %xmm2, %xmm0
|
|
|
|
; SSSE3-NEXT: andnpd %xmm1, %xmm2
|
|
|
|
; SSSE3-NEXT: orpd %xmm2, %xmm0
|
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSE41-LABEL: testb:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSE41: # BB#0: # %entry
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE41-NEXT: movapd %xmm0, %xmm2
|
|
|
|
; SSE41-NEXT: movapd %xmm1, %xmm0
|
|
|
|
; SSE41-NEXT: cmpnlepd %xmm2, %xmm0
|
|
|
|
; SSE41-NEXT: blendvpd %xmm2, %xmm1
|
|
|
|
; SSE41-NEXT: movapd %xmm1, %xmm0
|
|
|
|
; SSE41-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX-LABEL: testb:
|
2014-10-02 14:52:19 +08:00
|
|
|
; AVX: # BB#0: # %entry
|
2014-10-02 05:03:21 +08:00
|
|
|
; AVX-NEXT: vcmpnlepd %xmm0, %xmm1, %xmm2
|
|
|
|
; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
|
|
|
|
; AVX-NEXT: retq
|
2014-10-02 14:52:19 +08:00
|
|
|
entry:
|
2011-09-18 08:41:38 +08:00
|
|
|
%min_is_x = fcmp ult <2 x double> %x, %y
|
|
|
|
%min = select <2 x i1> %min_is_x, <2 x double> %x, <2 x double> %y
|
|
|
|
ret <2 x double> %min
|
2011-09-13 03:30:40 +08:00
|
|
|
}
|
2014-05-27 11:42:20 +08:00
|
|
|
|
|
|
|
; If we can figure out a blend has a constant mask, we should emit the
|
|
|
|
; blend instruction with an immediate mask
|
|
|
|
define <4 x double> @constant_blendvpd_avx(<4 x double> %xy, <4 x double> %ab) {
|
2014-11-24 20:23:15 +08:00
|
|
|
; SSE2-LABEL: constant_blendvpd_avx:
|
|
|
|
; SSE2: # BB#0: # %entry
|
2015-02-04 18:46:53 +08:00
|
|
|
; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
|
2014-11-24 20:23:15 +08:00
|
|
|
; SSE2-NEXT: movaps %xmm2, %xmm0
|
2015-02-04 18:58:53 +08:00
|
|
|
; SSE2-NEXT: movapd %xmm3, %xmm1
|
2014-11-24 20:23:15 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: constant_blendvpd_avx:
|
|
|
|
; SSSE3: # BB#0: # %entry
|
2015-02-04 18:46:53 +08:00
|
|
|
; SSSE3-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
|
2014-11-24 20:23:15 +08:00
|
|
|
; SSSE3-NEXT: movaps %xmm2, %xmm0
|
2015-02-04 18:58:53 +08:00
|
|
|
; SSSE3-NEXT: movapd %xmm3, %xmm1
|
2014-11-24 20:23:15 +08:00
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSE41-LABEL: constant_blendvpd_avx:
|
|
|
|
; SSE41: # BB#0: # %entry
|
|
|
|
; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm1[0],xmm3[1]
|
|
|
|
; SSE41-NEXT: movaps %xmm2, %xmm0
|
|
|
|
; SSE41-NEXT: retq
|
2014-10-02 05:03:21 +08:00
|
|
|
;
|
|
|
|
; AVX-LABEL: constant_blendvpd_avx:
|
2014-10-02 14:52:19 +08:00
|
|
|
; AVX: # BB#0: # %entry
|
2014-10-02 05:03:21 +08:00
|
|
|
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3]
|
|
|
|
; AVX-NEXT: retq
|
2014-10-02 14:52:19 +08:00
|
|
|
entry:
|
|
|
|
%select = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x double> %xy, <4 x double> %ab
|
|
|
|
ret <4 x double> %select
|
2014-05-27 11:42:20 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
define <8 x float> @constant_blendvps_avx(<8 x float> %xyzw, <8 x float> %abcd) {
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE2-LABEL: constant_blendvps_avx:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSE2: # BB#0: # %entry
|
[x86,sdag] Two interrelated changes to the x86 and sdag code.
First, don't combine bit masking into vector shuffles (even ones the
target can handle) once operation legalization has taken place. Custom
legalization of vector shuffles may exist for these patterns (making the
predicate return true) but that custom legalization may in some cases
produce the exact bit math this matches. We only really want to handle
this prior to operation legalization.
However, the x86 backend, in a fit of awesome, relied on this. What it
would do is mark VSELECTs as expand, which would turn them into
arithmetic, which this would then match back into vector shuffles, which
we would then lower properly. Amazing.
Instead, the second change is to teach the x86 backend to directly form
vector shuffles from VSELECT nodes with constant conditions, and to mark
all of the vector types we support lowering blends as shuffles as custom
VSELECT lowering. We still mark the forms which actually support
variable blends as *legal* so that the custom lowering is bypassed, and
the legal lowering can even be used by the vector shuffle legalization
(yes, i know, this is confusing. but that's how the patterns are
written).
This makes the VSELECT lowering much more sensible, and in fact should
fix a bunch of bugs with it. However, as you'll see in the test cases,
right now what it does is point out the *hilarious* deficiency of the
new vector shuffle lowering when it comes to blends. Fortunately, my
very next patch fixes that. I can't submit it yet, because that patch,
somewhat obviously, forms the exact and/or pattern that the DAG combine
is matching here! Without this patch, teaching the vector shuffle
lowering to produce the right code infloops in the DAG combiner. With
this patch alone, we produce terrible code but at least lower through
the right paths. With both patches, all the regressions here should be
fixed, and a bunch of the improvements (like using 2 shufps with no
memory loads instead of 2 andps with memory loads and an orps) will
stay. Win!
There is one other change worth noting here. We had hilariously wrong
vectorization cost estimates for vselect because we fell through to the
code path that assumed all "expand" vector operations are scalarized.
However, the "expand" lowering of VSELECT is vector bit math, most
definitely not scalarized. So now we go back to the correct if horribly
naive cost of "1" for "not scalarized". If anyone wants to add actual
modeling of shuffle costs, that would be cool, but this seems an
improvement on its own. Note the removal of 16 and 32 "costs" for doing
a blend. Even in SSE2 we can blend in fewer than 16 instructions. ;] Of
course, we don't right now because of OMG bad code, but I'm going to fix
that. Next patch. I promise.
llvm-svn: 229835
2015-02-19 18:36:19 +08:00
|
|
|
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0]
|
|
|
|
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
|
|
|
|
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm3[2,0]
|
|
|
|
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0]
|
|
|
|
; SSE2-NEXT: movaps %xmm2, %xmm0
|
|
|
|
; SSE2-NEXT: movaps %xmm3, %xmm1
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: constant_blendvps_avx:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSSE3: # BB#0: # %entry
|
[x86,sdag] Two interrelated changes to the x86 and sdag code.
First, don't combine bit masking into vector shuffles (even ones the
target can handle) once operation legalization has taken place. Custom
legalization of vector shuffles may exist for these patterns (making the
predicate return true) but that custom legalization may in some cases
produce the exact bit math this matches. We only really want to handle
this prior to operation legalization.
However, the x86 backend, in a fit of awesome, relied on this. What it
would do is mark VSELECTs as expand, which would turn them into
arithmetic, which this would then match back into vector shuffles, which
we would then lower properly. Amazing.
Instead, the second change is to teach the x86 backend to directly form
vector shuffles from VSELECT nodes with constant conditions, and to mark
all of the vector types we support lowering blends as shuffles as custom
VSELECT lowering. We still mark the forms which actually support
variable blends as *legal* so that the custom lowering is bypassed, and
the legal lowering can even be used by the vector shuffle legalization
(yes, i know, this is confusing. but that's how the patterns are
written).
This makes the VSELECT lowering much more sensible, and in fact should
fix a bunch of bugs with it. However, as you'll see in the test cases,
right now what it does is point out the *hilarious* deficiency of the
new vector shuffle lowering when it comes to blends. Fortunately, my
very next patch fixes that. I can't submit it yet, because that patch,
somewhat obviously, forms the exact and/or pattern that the DAG combine
is matching here! Without this patch, teaching the vector shuffle
lowering to produce the right code infloops in the DAG combiner. With
this patch alone, we produce terrible code but at least lower through
the right paths. With both patches, all the regressions here should be
fixed, and a bunch of the improvements (like using 2 shufps with no
memory loads instead of 2 andps with memory loads and an orps) will
stay. Win!
There is one other change worth noting here. We had hilariously wrong
vectorization cost estimates for vselect because we fell through to the
code path that assumed all "expand" vector operations are scalarized.
However, the "expand" lowering of VSELECT is vector bit math, most
definitely not scalarized. So now we go back to the correct if horribly
naive cost of "1" for "not scalarized". If anyone wants to add actual
modeling of shuffle costs, that would be cool, but this seems an
improvement on its own. Note the removal of 16 and 32 "costs" for doing
a blend. Even in SSE2 we can blend in fewer than 16 instructions. ;] Of
course, we don't right now because of OMG bad code, but I'm going to fix
that. Next patch. I promise.
llvm-svn: 229835
2015-02-19 18:36:19 +08:00
|
|
|
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0]
|
|
|
|
; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
|
|
|
|
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm3[2,0]
|
|
|
|
; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0]
|
|
|
|
; SSSE3-NEXT: movaps %xmm2, %xmm0
|
|
|
|
; SSSE3-NEXT: movaps %xmm3, %xmm1
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSE41-LABEL: constant_blendvps_avx:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSE41: # BB#0: # %entry
|
2014-11-05 07:25:08 +08:00
|
|
|
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
|
|
|
|
; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE41-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX-LABEL: constant_blendvps_avx:
|
2014-10-02 14:52:19 +08:00
|
|
|
; AVX: # BB#0: # %entry
|
2014-10-02 05:03:21 +08:00
|
|
|
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
|
|
|
|
; AVX-NEXT: retq
|
2014-10-02 14:52:19 +08:00
|
|
|
entry:
|
|
|
|
%select = select <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>, <8 x float> %xyzw, <8 x float> %abcd
|
|
|
|
ret <8 x float> %select
|
2014-05-27 11:42:20 +08:00
|
|
|
}
|
|
|
|
|
2014-10-02 05:07:07 +08:00
|
|
|
define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) {
|
|
|
|
; SSE2-LABEL: constant_pblendvb_avx2:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSE2: # BB#0: # %entry
|
2015-10-08 16:13:02 +08:00
|
|
|
; SSE2-NEXT: movaps {{.*#+}} xmm4 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
|
2015-02-19 20:10:37 +08:00
|
|
|
; SSE2-NEXT: movaps %xmm4, %xmm5
|
2015-10-08 16:13:02 +08:00
|
|
|
; SSE2-NEXT: andnps %xmm0, %xmm5
|
|
|
|
; SSE2-NEXT: andps %xmm4, %xmm2
|
|
|
|
; SSE2-NEXT: orps %xmm2, %xmm5
|
|
|
|
; SSE2-NEXT: andps %xmm4, %xmm3
|
|
|
|
; SSE2-NEXT: andnps %xmm1, %xmm4
|
|
|
|
; SSE2-NEXT: orps %xmm3, %xmm4
|
2016-02-02 05:06:32 +08:00
|
|
|
; SSE2-NEXT: movaps %xmm5, %xmm0
|
|
|
|
; SSE2-NEXT: movaps %xmm4, %xmm1
|
2014-10-02 05:07:07 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: constant_pblendvb_avx2:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSSE3: # BB#0: # %entry
|
2015-10-08 16:13:02 +08:00
|
|
|
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [128,128,2,128,4,5,6,128,128,128,10,128,12,13,14,128]
|
|
|
|
; SSSE3-NEXT: pshufb %xmm4, %xmm0
|
|
|
|
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [0,1,128,3,128,128,128,7,8,9,128,11,128,128,128,15]
|
|
|
|
; SSSE3-NEXT: pshufb %xmm5, %xmm2
|
[x86,sdag] Two interrelated changes to the x86 and sdag code.
First, don't combine bit masking into vector shuffles (even ones the
target can handle) once operation legalization has taken place. Custom
legalization of vector shuffles may exist for these patterns (making the
predicate return true) but that custom legalization may in some cases
produce the exact bit math this matches. We only really want to handle
this prior to operation legalization.
However, the x86 backend, in a fit of awesome, relied on this. What it
would do is mark VSELECTs as expand, which would turn them into
arithmetic, which this would then match back into vector shuffles, which
we would then lower properly. Amazing.
Instead, the second change is to teach the x86 backend to directly form
vector shuffles from VSELECT nodes with constant conditions, and to mark
all of the vector types we support lowering blends as shuffles as custom
VSELECT lowering. We still mark the forms which actually support
variable blends as *legal* so that the custom lowering is bypassed, and
the legal lowering can even be used by the vector shuffle legalization
(yes, i know, this is confusing. but that's how the patterns are
written).
This makes the VSELECT lowering much more sensible, and in fact should
fix a bunch of bugs with it. However, as you'll see in the test cases,
right now what it does is point out the *hilarious* deficiency of the
new vector shuffle lowering when it comes to blends. Fortunately, my
very next patch fixes that. I can't submit it yet, because that patch,
somewhat obviously, forms the exact and/or pattern that the DAG combine
is matching here! Without this patch, teaching the vector shuffle
lowering to produce the right code infloops in the DAG combiner. With
this patch alone, we produce terrible code but at least lower through
the right paths. With both patches, all the regressions here should be
fixed, and a bunch of the improvements (like using 2 shufps with no
memory loads instead of 2 andps with memory loads and an orps) will
stay. Win!
There is one other change worth noting here. We had hilariously wrong
vectorization cost estimates for vselect because we fell through to the
code path that assumed all "expand" vector operations are scalarized.
However, the "expand" lowering of VSELECT is vector bit math, most
definitely not scalarized. So now we go back to the correct if horribly
naive cost of "1" for "not scalarized". If anyone wants to add actual
modeling of shuffle costs, that would be cool, but this seems an
improvement on its own. Note the removal of 16 and 32 "costs" for doing
a blend. Even in SSE2 we can blend in fewer than 16 instructions. ;] Of
course, we don't right now because of OMG bad code, but I'm going to fix
that. Next patch. I promise.
llvm-svn: 229835
2015-02-19 18:36:19 +08:00
|
|
|
; SSSE3-NEXT: por %xmm2, %xmm0
|
2015-10-08 16:13:02 +08:00
|
|
|
; SSSE3-NEXT: pshufb %xmm4, %xmm1
|
|
|
|
; SSSE3-NEXT: pshufb %xmm5, %xmm3
|
[x86,sdag] Two interrelated changes to the x86 and sdag code.
First, don't combine bit masking into vector shuffles (even ones the
target can handle) once operation legalization has taken place. Custom
legalization of vector shuffles may exist for these patterns (making the
predicate return true) but that custom legalization may in some cases
produce the exact bit math this matches. We only really want to handle
this prior to operation legalization.
However, the x86 backend, in a fit of awesome, relied on this. What it
would do is mark VSELECTs as expand, which would turn them into
arithmetic, which this would then match back into vector shuffles, which
we would then lower properly. Amazing.
Instead, the second change is to teach the x86 backend to directly form
vector shuffles from VSELECT nodes with constant conditions, and to mark
all of the vector types we support lowering blends as shuffles as custom
VSELECT lowering. We still mark the forms which actually support
variable blends as *legal* so that the custom lowering is bypassed, and
the legal lowering can even be used by the vector shuffle legalization
(yes, i know, this is confusing. but that's how the patterns are
written).
This makes the VSELECT lowering much more sensible, and in fact should
fix a bunch of bugs with it. However, as you'll see in the test cases,
right now what it does is point out the *hilarious* deficiency of the
new vector shuffle lowering when it comes to blends. Fortunately, my
very next patch fixes that. I can't submit it yet, because that patch,
somewhat obviously, forms the exact and/or pattern that the DAG combine
is matching here! Without this patch, teaching the vector shuffle
lowering to produce the right code infloops in the DAG combiner. With
this patch alone, we produce terrible code but at least lower through
the right paths. With both patches, all the regressions here should be
fixed, and a bunch of the improvements (like using 2 shufps with no
memory loads instead of 2 andps with memory loads and an orps) will
stay. Win!
There is one other change worth noting here. We had hilariously wrong
vectorization cost estimates for vselect because we fell through to the
code path that assumed all "expand" vector operations are scalarized.
However, the "expand" lowering of VSELECT is vector bit math, most
definitely not scalarized. So now we go back to the correct if horribly
naive cost of "1" for "not scalarized". If anyone wants to add actual
modeling of shuffle costs, that would be cool, but this seems an
improvement on its own. Note the removal of 16 and 32 "costs" for doing
a blend. Even in SSE2 we can blend in fewer than 16 instructions. ;] Of
course, we don't right now because of OMG bad code, but I'm going to fix
that. Next patch. I promise.
llvm-svn: 229835
2015-02-19 18:36:19 +08:00
|
|
|
; SSSE3-NEXT: por %xmm3, %xmm1
|
2014-10-02 05:07:07 +08:00
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSE41-LABEL: constant_pblendvb_avx2:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSE41: # BB#0: # %entry
|
2014-10-02 05:07:07 +08:00
|
|
|
; SSE41-NEXT: movdqa %xmm0, %xmm4
|
2015-10-08 16:13:02 +08:00
|
|
|
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
|
|
|
|
; SSE41-NEXT: pblendvb %xmm2, %xmm4
|
|
|
|
; SSE41-NEXT: pblendvb %xmm3, %xmm1
|
|
|
|
; SSE41-NEXT: movdqa %xmm4, %xmm0
|
2014-10-02 05:07:07 +08:00
|
|
|
; SSE41-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: constant_pblendvb_avx2:
|
2014-10-02 14:52:19 +08:00
|
|
|
; AVX1: # BB#0: # %entry
|
2016-02-17 18:50:06 +08:00
|
|
|
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
|
|
|
|
; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
|
|
|
|
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
|
|
|
|
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
|
2014-10-02 05:07:07 +08:00
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: constant_pblendvb_avx2:
|
2014-10-02 14:52:19 +08:00
|
|
|
; AVX2: # BB#0: # %entry
|
2015-10-08 16:13:02 +08:00
|
|
|
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
|
2014-10-02 05:07:07 +08:00
|
|
|
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
|
|
|
|
; AVX2-NEXT: retq
|
2014-10-02 14:52:19 +08:00
|
|
|
entry:
|
|
|
|
%select = select <32 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <32 x i8> %xyzw, <32 x i8> %abcd
|
|
|
|
ret <32 x i8> %select
|
2014-10-02 05:07:07 +08:00
|
|
|
}
|
|
|
|
|
2014-05-27 11:42:20 +08:00
|
|
|
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>)
|
|
|
|
declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>)
|
2014-05-30 06:04:42 +08:00
|
|
|
|
|
|
|
;; 4 tests for shufflevectors that optimize to blend + immediate
|
|
|
|
define <4 x float> @blend_shufflevector_4xfloat(<4 x float> %a, <4 x float> %b) {
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE2-LABEL: blend_shufflevector_4xfloat:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSE2: # BB#0: # %entry
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
|
[x86] Enable the new vector shuffle lowering by default.
Update the entire regression test suite for the new shuffles. Remove
most of the old testing which was devoted to the old shuffle lowering
path and is no longer relevant really. Also remove a few other random
tests that only really exercised shuffles and only incidently or without
any interesting aspects to them.
Benchmarking that I have done shows a few small regressions with this on
LNT, zero measurable regressions on real, large applications, and for
several benchmarks where the loop vectorizer fires in the hot path it
shows 5% to 40% improvements for SSE2 and SSE3 code running on Sandy
Bridge machines. Running on AMD machines shows even more dramatic
improvements.
When using newer ISA vector extensions the gains are much more modest,
but the code is still better on the whole. There are a few regressions
being tracked (PR21137, PR21138, PR21139) but by and large this is
expected to be a win for x86 generated code performance.
It is also more correct than the code it replaces. I have fuzz tested
this extensively with ISA extensions up through AVX2 and found no
crashes or miscompiles (yet...). The old lowering had a few miscompiles
and crashers after a somewhat smaller amount of fuzz testing.
There is one significant area where the new code path lags behind and
that is in AVX-512 support. However, there was *extremely little*
support for that already and so this isn't a significant step backwards
and the new framework will probably make it easier to implement lowering
that uses the full power of AVX-512's table-based shuffle+blend (IMO).
Many thanks to Quentin, Andrea, Robert, and others for benchmarking
assistance. Thanks to Adam and others for help with AVX-512. Thanks to
Hal, Eric, and *many* others for answering my incessant questions about
how the backend actually works. =]
I will leave the old code path in the tree until the 3 PRs above are at
least resolved to folks' satisfaction. Then I will rip it (and 1000s of
lines of code) out. =] I don't expect this flag to stay around for very
long. It may not survive next week.
llvm-svn: 219046
2014-10-04 11:52:55 +08:00
|
|
|
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: blend_shufflevector_4xfloat:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSSE3: # BB#0: # %entry
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
|
[x86] Enable the new vector shuffle lowering by default.
Update the entire regression test suite for the new shuffles. Remove
most of the old testing which was devoted to the old shuffle lowering
path and is no longer relevant really. Also remove a few other random
tests that only really exercised shuffles and only incidently or without
any interesting aspects to them.
Benchmarking that I have done shows a few small regressions with this on
LNT, zero measurable regressions on real, large applications, and for
several benchmarks where the loop vectorizer fires in the hot path it
shows 5% to 40% improvements for SSE2 and SSE3 code running on Sandy
Bridge machines. Running on AMD machines shows even more dramatic
improvements.
When using newer ISA vector extensions the gains are much more modest,
but the code is still better on the whole. There are a few regressions
being tracked (PR21137, PR21138, PR21139) but by and large this is
expected to be a win for x86 generated code performance.
It is also more correct than the code it replaces. I have fuzz tested
this extensively with ISA extensions up through AVX2 and found no
crashes or miscompiles (yet...). The old lowering had a few miscompiles
and crashers after a somewhat smaller amount of fuzz testing.
There is one significant area where the new code path lags behind and
that is in AVX-512 support. However, there was *extremely little*
support for that already and so this isn't a significant step backwards
and the new framework will probably make it easier to implement lowering
that uses the full power of AVX-512's table-based shuffle+blend (IMO).
Many thanks to Quentin, Andrea, Robert, and others for benchmarking
assistance. Thanks to Adam and others for help with AVX-512. Thanks to
Hal, Eric, and *many* others for answering my incessant questions about
how the backend actually works. =]
I will leave the old code path in the tree until the 3 PRs above are at
least resolved to folks' satisfaction. Then I will rip it (and 1000s of
lines of code) out. =] I don't expect this flag to stay around for very
long. It may not survive next week.
llvm-svn: 219046
2014-10-04 11:52:55 +08:00
|
|
|
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSE41-LABEL: blend_shufflevector_4xfloat:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSE41: # BB#0: # %entry
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
|
|
|
|
; SSE41-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX-LABEL: blend_shufflevector_4xfloat:
|
2014-10-02 14:52:19 +08:00
|
|
|
; AVX: # BB#0: # %entry
|
2014-10-02 05:03:21 +08:00
|
|
|
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
|
|
|
|
; AVX-NEXT: retq
|
2014-10-02 14:52:19 +08:00
|
|
|
entry:
|
|
|
|
%select = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
|
|
|
|
ret <4 x float> %select
|
2014-05-30 06:04:42 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
define <8 x float> @blend_shufflevector_8xfloat(<8 x float> %a, <8 x float> %b) {
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE2-LABEL: blend_shufflevector_8xfloat:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSE2: # BB#0: # %entry
|
2015-02-04 18:46:53 +08:00
|
|
|
; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0]
|
|
|
|
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[0,2]
|
|
|
|
; SSE2-NEXT: movaps %xmm2, %xmm0
|
|
|
|
; SSE2-NEXT: movaps %xmm3, %xmm1
|
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: blend_shufflevector_8xfloat:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSSE3: # BB#0: # %entry
|
2015-02-04 18:46:53 +08:00
|
|
|
; SSSE3-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0]
|
|
|
|
; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[0,2]
|
|
|
|
; SSSE3-NEXT: movaps %xmm2, %xmm0
|
|
|
|
; SSSE3-NEXT: movaps %xmm3, %xmm1
|
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSE41-LABEL: blend_shufflevector_8xfloat:
|
2014-10-02 14:52:19 +08:00
|
|
|
; SSE41: # BB#0: # %entry
|
2014-11-05 07:25:08 +08:00
|
|
|
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
|
|
|
|
; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3]
|
2014-10-02 05:03:21 +08:00
|
|
|
; SSE41-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX-LABEL: blend_shufflevector_8xfloat:
|
2014-10-02 14:52:19 +08:00
|
|
|
; AVX: # BB#0: # %entry
|
2014-10-02 05:03:21 +08:00
|
|
|
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5],ymm0[6],ymm1[7]
|
|
|
|
; AVX-NEXT: retq
|
2014-10-02 14:52:19 +08:00
|
|
|
entry:
|
|
|
|
%select = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 6, i32 15>
|
|
|
|
ret <8 x float> %select
|
2014-05-30 06:04:42 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
define <4 x double> @blend_shufflevector_4xdouble(<4 x double> %a, <4 x double> %b) {
|
[x86] Enable the new vector shuffle lowering by default.
Update the entire regression test suite for the new shuffles. Remove
most of the old testing which was devoted to the old shuffle lowering
path and is no longer relevant really. Also remove a few other random
tests that only really exercised shuffles and only incidently or without
any interesting aspects to them.
Benchmarking that I have done shows a few small regressions with this on
LNT, zero measurable regressions on real, large applications, and for
several benchmarks where the loop vectorizer fires in the hot path it
shows 5% to 40% improvements for SSE2 and SSE3 code running on Sandy
Bridge machines. Running on AMD machines shows even more dramatic
improvements.
When using newer ISA vector extensions the gains are much more modest,
but the code is still better on the whole. There are a few regressions
being tracked (PR21137, PR21138, PR21139) but by and large this is
expected to be a win for x86 generated code performance.
It is also more correct than the code it replaces. I have fuzz tested
this extensively with ISA extensions up through AVX2 and found no
crashes or miscompiles (yet...). The old lowering had a few miscompiles
and crashers after a somewhat smaller amount of fuzz testing.
There is one significant area where the new code path lags behind and
that is in AVX-512 support. However, there was *extremely little*
support for that already and so this isn't a significant step backwards
and the new framework will probably make it easier to implement lowering
that uses the full power of AVX-512's table-based shuffle+blend (IMO).
Many thanks to Quentin, Andrea, Robert, and others for benchmarking
assistance. Thanks to Adam and others for help with AVX-512. Thanks to
Hal, Eric, and *many* others for answering my incessant questions about
how the backend actually works. =]
I will leave the old code path in the tree until the 3 PRs above are at
least resolved to folks' satisfaction. Then I will rip it (and 1000s of
lines of code) out. =] I don't expect this flag to stay around for very
long. It may not survive next week.
llvm-svn: 219046
2014-10-04 11:52:55 +08:00
|
|
|
; SSE2-LABEL: blend_shufflevector_4xdouble:
|
|
|
|
; SSE2: # BB#0: # %entry
|
2015-02-04 18:46:53 +08:00
|
|
|
; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
|
2015-02-04 18:58:53 +08:00
|
|
|
; SSE2-NEXT: movapd %xmm2, %xmm0
|
[x86] Enable the new vector shuffle lowering by default.
Update the entire regression test suite for the new shuffles. Remove
most of the old testing which was devoted to the old shuffle lowering
path and is no longer relevant really. Also remove a few other random
tests that only really exercised shuffles and only incidently or without
any interesting aspects to them.
Benchmarking that I have done shows a few small regressions with this on
LNT, zero measurable regressions on real, large applications, and for
several benchmarks where the loop vectorizer fires in the hot path it
shows 5% to 40% improvements for SSE2 and SSE3 code running on Sandy
Bridge machines. Running on AMD machines shows even more dramatic
improvements.
When using newer ISA vector extensions the gains are much more modest,
but the code is still better on the whole. There are a few regressions
being tracked (PR21137, PR21138, PR21139) but by and large this is
expected to be a win for x86 generated code performance.
It is also more correct than the code it replaces. I have fuzz tested
this extensively with ISA extensions up through AVX2 and found no
crashes or miscompiles (yet...). The old lowering had a few miscompiles
and crashers after a somewhat smaller amount of fuzz testing.
There is one significant area where the new code path lags behind and
that is in AVX-512 support. However, there was *extremely little*
support for that already and so this isn't a significant step backwards
and the new framework will probably make it easier to implement lowering
that uses the full power of AVX-512's table-based shuffle+blend (IMO).
Many thanks to Quentin, Andrea, Robert, and others for benchmarking
assistance. Thanks to Adam and others for help with AVX-512. Thanks to
Hal, Eric, and *many* others for answering my incessant questions about
how the backend actually works. =]
I will leave the old code path in the tree until the 3 PRs above are at
least resolved to folks' satisfaction. Then I will rip it (and 1000s of
lines of code) out. =] I don't expect this flag to stay around for very
long. It may not survive next week.
llvm-svn: 219046
2014-10-04 11:52:55 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: blend_shufflevector_4xdouble:
|
|
|
|
; SSSE3: # BB#0: # %entry
|
2015-02-04 18:46:53 +08:00
|
|
|
; SSSE3-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
|
2015-02-04 18:58:53 +08:00
|
|
|
; SSSE3-NEXT: movapd %xmm2, %xmm0
|
[x86] Enable the new vector shuffle lowering by default.
Update the entire regression test suite for the new shuffles. Remove
most of the old testing which was devoted to the old shuffle lowering
path and is no longer relevant really. Also remove a few other random
tests that only really exercised shuffles and only incidently or without
any interesting aspects to them.
Benchmarking that I have done shows a few small regressions with this on
LNT, zero measurable regressions on real, large applications, and for
several benchmarks where the loop vectorizer fires in the hot path it
shows 5% to 40% improvements for SSE2 and SSE3 code running on Sandy
Bridge machines. Running on AMD machines shows even more dramatic
improvements.
When using newer ISA vector extensions the gains are much more modest,
but the code is still better on the whole. There are a few regressions
being tracked (PR21137, PR21138, PR21139) but by and large this is
expected to be a win for x86 generated code performance.
It is also more correct than the code it replaces. I have fuzz tested
this extensively with ISA extensions up through AVX2 and found no
crashes or miscompiles (yet...). The old lowering had a few miscompiles
and crashers after a somewhat smaller amount of fuzz testing.
There is one significant area where the new code path lags behind and
that is in AVX-512 support. However, there was *extremely little*
support for that already and so this isn't a significant step backwards
and the new framework will probably make it easier to implement lowering
that uses the full power of AVX-512's table-based shuffle+blend (IMO).
Many thanks to Quentin, Andrea, Robert, and others for benchmarking
assistance. Thanks to Adam and others for help with AVX-512. Thanks to
Hal, Eric, and *many* others for answering my incessant questions about
how the backend actually works. =]
I will leave the old code path in the tree until the 3 PRs above are at
least resolved to folks' satisfaction. Then I will rip it (and 1000s of
lines of code) out. =] I don't expect this flag to stay around for very
long. It may not survive next week.
llvm-svn: 219046
2014-10-04 11:52:55 +08:00
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSE41-LABEL: blend_shufflevector_4xdouble:
|
|
|
|
; SSE41: # BB#0: # %entry
|
|
|
|
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
|
|
|
|
; SSE41-NEXT: retq
|
2014-10-02 05:03:21 +08:00
|
|
|
;
|
|
|
|
; AVX-LABEL: blend_shufflevector_4xdouble:
|
2014-10-02 14:52:19 +08:00
|
|
|
; AVX: # BB#0: # %entry
|
2014-10-02 05:03:21 +08:00
|
|
|
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
|
|
|
|
; AVX-NEXT: retq
|
2014-10-02 14:52:19 +08:00
|
|
|
entry:
|
|
|
|
%select = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
|
|
|
|
ret <4 x double> %select
|
2014-05-30 06:04:42 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
define <4 x i64> @blend_shufflevector_4xi64(<4 x i64> %a, <4 x i64> %b) {
|
[x86] Enable the new vector shuffle lowering by default.
Update the entire regression test suite for the new shuffles. Remove
most of the old testing which was devoted to the old shuffle lowering
path and is no longer relevant really. Also remove a few other random
tests that only really exercised shuffles and only incidently or without
any interesting aspects to them.
Benchmarking that I have done shows a few small regressions with this on
LNT, zero measurable regressions on real, large applications, and for
several benchmarks where the loop vectorizer fires in the hot path it
shows 5% to 40% improvements for SSE2 and SSE3 code running on Sandy
Bridge machines. Running on AMD machines shows even more dramatic
improvements.
When using newer ISA vector extensions the gains are much more modest,
but the code is still better on the whole. There are a few regressions
being tracked (PR21137, PR21138, PR21139) but by and large this is
expected to be a win for x86 generated code performance.
It is also more correct than the code it replaces. I have fuzz tested
this extensively with ISA extensions up through AVX2 and found no
crashes or miscompiles (yet...). The old lowering had a few miscompiles
and crashers after a somewhat smaller amount of fuzz testing.
There is one significant area where the new code path lags behind and
that is in AVX-512 support. However, there was *extremely little*
support for that already and so this isn't a significant step backwards
and the new framework will probably make it easier to implement lowering
that uses the full power of AVX-512's table-based shuffle+blend (IMO).
Many thanks to Quentin, Andrea, Robert, and others for benchmarking
assistance. Thanks to Adam and others for help with AVX-512. Thanks to
Hal, Eric, and *many* others for answering my incessant questions about
how the backend actually works. =]
I will leave the old code path in the tree until the 3 PRs above are at
least resolved to folks' satisfaction. Then I will rip it (and 1000s of
lines of code) out. =] I don't expect this flag to stay around for very
long. It may not survive next week.
llvm-svn: 219046
2014-10-04 11:52:55 +08:00
|
|
|
; SSE2-LABEL: blend_shufflevector_4xi64:
|
|
|
|
; SSE2: # BB#0: # %entry
|
2015-02-04 18:46:53 +08:00
|
|
|
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
|
[x86] Enable the new vector shuffle lowering by default.
Update the entire regression test suite for the new shuffles. Remove
most of the old testing which was devoted to the old shuffle lowering
path and is no longer relevant really. Also remove a few other random
tests that only really exercised shuffles and only incidently or without
any interesting aspects to them.
Benchmarking that I have done shows a few small regressions with this on
LNT, zero measurable regressions on real, large applications, and for
several benchmarks where the loop vectorizer fires in the hot path it
shows 5% to 40% improvements for SSE2 and SSE3 code running on Sandy
Bridge machines. Running on AMD machines shows even more dramatic
improvements.
When using newer ISA vector extensions the gains are much more modest,
but the code is still better on the whole. There are a few regressions
being tracked (PR21137, PR21138, PR21139) but by and large this is
expected to be a win for x86 generated code performance.
It is also more correct than the code it replaces. I have fuzz tested
this extensively with ISA extensions up through AVX2 and found no
crashes or miscompiles (yet...). The old lowering had a few miscompiles
and crashers after a somewhat smaller amount of fuzz testing.
There is one significant area where the new code path lags behind and
that is in AVX-512 support. However, there was *extremely little*
support for that already and so this isn't a significant step backwards
and the new framework will probably make it easier to implement lowering
that uses the full power of AVX-512's table-based shuffle+blend (IMO).
Many thanks to Quentin, Andrea, Robert, and others for benchmarking
assistance. Thanks to Adam and others for help with AVX-512. Thanks to
Hal, Eric, and *many* others for answering my incessant questions about
how the backend actually works. =]
I will leave the old code path in the tree until the 3 PRs above are at
least resolved to folks' satisfaction. Then I will rip it (and 1000s of
lines of code) out. =] I don't expect this flag to stay around for very
long. It may not survive next week.
llvm-svn: 219046
2014-10-04 11:52:55 +08:00
|
|
|
; SSE2-NEXT: movaps %xmm3, %xmm1
|
|
|
|
; SSE2-NEXT: retq
|
2014-10-02 05:03:21 +08:00
|
|
|
;
|
[x86] Enable the new vector shuffle lowering by default.
Update the entire regression test suite for the new shuffles. Remove
most of the old testing which was devoted to the old shuffle lowering
path and is no longer relevant really. Also remove a few other random
tests that only really exercised shuffles and only incidently or without
any interesting aspects to them.
Benchmarking that I have done shows a few small regressions with this on
LNT, zero measurable regressions on real, large applications, and for
several benchmarks where the loop vectorizer fires in the hot path it
shows 5% to 40% improvements for SSE2 and SSE3 code running on Sandy
Bridge machines. Running on AMD machines shows even more dramatic
improvements.
When using newer ISA vector extensions the gains are much more modest,
but the code is still better on the whole. There are a few regressions
being tracked (PR21137, PR21138, PR21139) but by and large this is
expected to be a win for x86 generated code performance.
It is also more correct than the code it replaces. I have fuzz tested
this extensively with ISA extensions up through AVX2 and found no
crashes or miscompiles (yet...). The old lowering had a few miscompiles
and crashers after a somewhat smaller amount of fuzz testing.
There is one significant area where the new code path lags behind and
that is in AVX-512 support. However, there was *extremely little*
support for that already and so this isn't a significant step backwards
and the new framework will probably make it easier to implement lowering
that uses the full power of AVX-512's table-based shuffle+blend (IMO).
Many thanks to Quentin, Andrea, Robert, and others for benchmarking
assistance. Thanks to Adam and others for help with AVX-512. Thanks to
Hal, Eric, and *many* others for answering my incessant questions about
how the backend actually works. =]
I will leave the old code path in the tree until the 3 PRs above are at
least resolved to folks' satisfaction. Then I will rip it (and 1000s of
lines of code) out. =] I don't expect this flag to stay around for very
long. It may not survive next week.
llvm-svn: 219046
2014-10-04 11:52:55 +08:00
|
|
|
; SSSE3-LABEL: blend_shufflevector_4xi64:
|
|
|
|
; SSSE3: # BB#0: # %entry
|
2015-02-04 18:46:53 +08:00
|
|
|
; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
|
[x86] Enable the new vector shuffle lowering by default.
Update the entire regression test suite for the new shuffles. Remove
most of the old testing which was devoted to the old shuffle lowering
path and is no longer relevant really. Also remove a few other random
tests that only really exercised shuffles and only incidently or without
any interesting aspects to them.
Benchmarking that I have done shows a few small regressions with this on
LNT, zero measurable regressions on real, large applications, and for
several benchmarks where the loop vectorizer fires in the hot path it
shows 5% to 40% improvements for SSE2 and SSE3 code running on Sandy
Bridge machines. Running on AMD machines shows even more dramatic
improvements.
When using newer ISA vector extensions the gains are much more modest,
but the code is still better on the whole. There are a few regressions
being tracked (PR21137, PR21138, PR21139) but by and large this is
expected to be a win for x86 generated code performance.
It is also more correct than the code it replaces. I have fuzz tested
this extensively with ISA extensions up through AVX2 and found no
crashes or miscompiles (yet...). The old lowering had a few miscompiles
and crashers after a somewhat smaller amount of fuzz testing.
There is one significant area where the new code path lags behind and
that is in AVX-512 support. However, there was *extremely little*
support for that already and so this isn't a significant step backwards
and the new framework will probably make it easier to implement lowering
that uses the full power of AVX-512's table-based shuffle+blend (IMO).
Many thanks to Quentin, Andrea, Robert, and others for benchmarking
assistance. Thanks to Adam and others for help with AVX-512. Thanks to
Hal, Eric, and *many* others for answering my incessant questions about
how the backend actually works. =]
I will leave the old code path in the tree until the 3 PRs above are at
least resolved to folks' satisfaction. Then I will rip it (and 1000s of
lines of code) out. =] I don't expect this flag to stay around for very
long. It may not survive next week.
llvm-svn: 219046
2014-10-04 11:52:55 +08:00
|
|
|
; SSSE3-NEXT: movaps %xmm3, %xmm1
|
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSE41-LABEL: blend_shufflevector_4xi64:
|
|
|
|
; SSE41: # BB#0: # %entry
|
2014-11-05 07:25:08 +08:00
|
|
|
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
|
[x86] Enable the new vector shuffle lowering by default.
Update the entire regression test suite for the new shuffles. Remove
most of the old testing which was devoted to the old shuffle lowering
path and is no longer relevant really. Also remove a few other random
tests that only really exercised shuffles and only incidently or without
any interesting aspects to them.
Benchmarking that I have done shows a few small regressions with this on
LNT, zero measurable regressions on real, large applications, and for
several benchmarks where the loop vectorizer fires in the hot path it
shows 5% to 40% improvements for SSE2 and SSE3 code running on Sandy
Bridge machines. Running on AMD machines shows even more dramatic
improvements.
When using newer ISA vector extensions the gains are much more modest,
but the code is still better on the whole. There are a few regressions
being tracked (PR21137, PR21138, PR21139) but by and large this is
expected to be a win for x86 generated code performance.
It is also more correct than the code it replaces. I have fuzz tested
this extensively with ISA extensions up through AVX2 and found no
crashes or miscompiles (yet...). The old lowering had a few miscompiles
and crashers after a somewhat smaller amount of fuzz testing.
There is one significant area where the new code path lags behind and
that is in AVX-512 support. However, there was *extremely little*
support for that already and so this isn't a significant step backwards
and the new framework will probably make it easier to implement lowering
that uses the full power of AVX-512's table-based shuffle+blend (IMO).
Many thanks to Quentin, Andrea, Robert, and others for benchmarking
assistance. Thanks to Adam and others for help with AVX-512. Thanks to
Hal, Eric, and *many* others for answering my incessant questions about
how the backend actually works. =]
I will leave the old code path in the tree until the 3 PRs above are at
least resolved to folks' satisfaction. Then I will rip it (and 1000s of
lines of code) out. =] I don't expect this flag to stay around for very
long. It may not survive next week.
llvm-svn: 219046
2014-10-04 11:52:55 +08:00
|
|
|
; SSE41-NEXT: movaps %xmm3, %xmm1
|
|
|
|
; SSE41-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: blend_shufflevector_4xi64:
|
|
|
|
; AVX1: # BB#0: # %entry
|
|
|
|
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3]
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: blend_shufflevector_4xi64:
|
|
|
|
; AVX2: # BB#0: # %entry
|
|
|
|
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
|
|
|
|
; AVX2-NEXT: retq
|
2014-10-02 14:52:19 +08:00
|
|
|
entry:
|
|
|
|
%select = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
|
|
|
|
ret <4 x i64> %select
|
2014-05-30 06:04:42 +08:00
|
|
|
}
|
2016-02-17 06:13:59 +08:00
|
|
|
|
|
|
|
define <4 x i32> @blend_logic_v4i32(<4 x i32> %b, <4 x i32> %a, <4 x i32> %c) {
|
|
|
|
; SSE2-LABEL: blend_logic_v4i32:
|
|
|
|
; SSE2: # BB#0: # %entry
|
|
|
|
; SSE2-NEXT: psrad $31, %xmm0
|
|
|
|
; SSE2-NEXT: pand %xmm0, %xmm1
|
|
|
|
; SSE2-NEXT: pandn %xmm2, %xmm0
|
|
|
|
; SSE2-NEXT: por %xmm1, %xmm0
|
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: blend_logic_v4i32:
|
|
|
|
; SSSE3: # BB#0: # %entry
|
|
|
|
; SSSE3-NEXT: psrad $31, %xmm0
|
|
|
|
; SSSE3-NEXT: pand %xmm0, %xmm1
|
|
|
|
; SSSE3-NEXT: pandn %xmm2, %xmm0
|
|
|
|
; SSSE3-NEXT: por %xmm1, %xmm0
|
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSE41-LABEL: blend_logic_v4i32:
|
|
|
|
; SSE41: # BB#0: # %entry
|
|
|
|
; SSE41-NEXT: psrad $31, %xmm0
|
|
|
|
; SSE41-NEXT: pblendvb %xmm1, %xmm2
|
|
|
|
; SSE41-NEXT: movdqa %xmm2, %xmm0
|
|
|
|
; SSE41-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX-LABEL: blend_logic_v4i32:
|
|
|
|
; AVX: # BB#0: # %entry
|
|
|
|
; AVX-NEXT: vpsrad $31, %xmm0, %xmm0
|
|
|
|
; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
|
|
|
|
; AVX-NEXT: retq
|
|
|
|
entry:
|
|
|
|
%b.lobit = ashr <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
|
|
|
|
%sub = sub nsw <4 x i32> zeroinitializer, %a
|
|
|
|
%0 = xor <4 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1>
|
|
|
|
%1 = and <4 x i32> %c, %0
|
|
|
|
%2 = and <4 x i32> %a, %b.lobit
|
|
|
|
%cond = or <4 x i32> %1, %2
|
|
|
|
ret <4 x i32> %cond
|
|
|
|
}
|
|
|
|
|
|
|
|
define <8 x i32> @blend_logic_v8i32(<8 x i32> %b, <8 x i32> %a, <8 x i32> %c) {
|
|
|
|
; SSE2-LABEL: blend_logic_v8i32:
|
|
|
|
; SSE2: # BB#0: # %entry
|
|
|
|
; SSE2-NEXT: psrad $31, %xmm0
|
|
|
|
; SSE2-NEXT: psrad $31, %xmm1
|
|
|
|
; SSE2-NEXT: pand %xmm1, %xmm3
|
|
|
|
; SSE2-NEXT: pandn %xmm5, %xmm1
|
|
|
|
; SSE2-NEXT: pand %xmm0, %xmm2
|
|
|
|
; SSE2-NEXT: pandn %xmm4, %xmm0
|
|
|
|
; SSE2-NEXT: por %xmm2, %xmm0
|
|
|
|
; SSE2-NEXT: por %xmm3, %xmm1
|
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: blend_logic_v8i32:
|
|
|
|
; SSSE3: # BB#0: # %entry
|
|
|
|
; SSSE3-NEXT: psrad $31, %xmm0
|
|
|
|
; SSSE3-NEXT: psrad $31, %xmm1
|
|
|
|
; SSSE3-NEXT: pand %xmm1, %xmm3
|
|
|
|
; SSSE3-NEXT: pandn %xmm5, %xmm1
|
|
|
|
; SSSE3-NEXT: pand %xmm0, %xmm2
|
|
|
|
; SSSE3-NEXT: pandn %xmm4, %xmm0
|
|
|
|
; SSSE3-NEXT: por %xmm2, %xmm0
|
|
|
|
; SSSE3-NEXT: por %xmm3, %xmm1
|
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSE41-LABEL: blend_logic_v8i32:
|
|
|
|
; SSE41: # BB#0: # %entry
|
|
|
|
; SSE41-NEXT: psrad $31, %xmm1
|
|
|
|
; SSE41-NEXT: psrad $31, %xmm0
|
|
|
|
; SSE41-NEXT: pblendvb %xmm2, %xmm4
|
|
|
|
; SSE41-NEXT: movdqa %xmm1, %xmm0
|
|
|
|
; SSE41-NEXT: pblendvb %xmm3, %xmm5
|
|
|
|
; SSE41-NEXT: movdqa %xmm4, %xmm0
|
|
|
|
; SSE41-NEXT: movdqa %xmm5, %xmm1
|
|
|
|
; SSE41-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: blend_logic_v8i32:
|
|
|
|
; AVX1: # BB#0: # %entry
|
|
|
|
; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
|
|
|
|
; AVX1-NEXT: vandnps %ymm2, %ymm0, %ymm2
|
|
|
|
; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0
|
|
|
|
; AVX1-NEXT: vorps %ymm0, %ymm2, %ymm0
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: blend_logic_v8i32:
|
|
|
|
; AVX2: # BB#0: # %entry
|
|
|
|
; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
entry:
|
|
|
|
%b.lobit = ashr <8 x i32> %b, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
|
|
|
|
%sub = sub nsw <8 x i32> zeroinitializer, %a
|
|
|
|
%0 = xor <8 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
|
|
|
|
%1 = and <8 x i32> %c, %0
|
|
|
|
%2 = and <8 x i32> %a, %b.lobit
|
|
|
|
%cond = or <8 x i32> %1, %2
|
|
|
|
ret <8 x i32> %cond
|
|
|
|
}
|
|
|
|
|
|
|
|
define <4 x i32> @blend_neg_logic_v4i32(<4 x i32> %a, <4 x i32> %b) {
|
|
|
|
; SSE2-LABEL: blend_neg_logic_v4i32:
|
|
|
|
; SSE2: # BB#0: # %entry
|
|
|
|
; SSE2-NEXT: psrad $31, %xmm1
|
[X86] Don't turn (c?-v:v) into (c?-v:0) by blindly using PSIGN.
Currently, we sometimes miscompile this vector pattern:
(c ? -v : v)
We lower it to (because "c" is <4 x i1>, lowered as a vector mask):
(~c & v) | (c & -v)
When we have SSSE3, we incorrectly lower that to PSIGN, which does:
(c < 0 ? -v : c > 0 ? v : 0)
in other words, when c is either all-ones or all-zero:
(c ? -v : 0)
While this is an old bug, it rarely triggers because the PSIGN combine
is too sensitive to operand order. This will be improved separately.
Note that the PSIGN tests are also incorrect. Consider:
%b.lobit = ashr <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
%sub = sub nsw <4 x i32> zeroinitializer, %a
%0 = xor <4 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1>
%1 = and <4 x i32> %a, %0
%2 = and <4 x i32> %b.lobit, %sub
%cond = or <4 x i32> %1, %2
ret <4 x i32> %cond
if %b is zero:
%b.lobit = <4 x i32> zeroinitializer
%sub = sub nsw <4 x i32> zeroinitializer, %a
%0 = <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
%1 = <4 x i32> %a
%2 = <4 x i32> zeroinitializer
%cond = or <4 x i32> %a, zeroinitializer
ret <4 x i32> %a
whereas we currently generate:
psignd %xmm1, %xmm0
retq
which returns 0, as %xmm1 is 0.
Instead, use a pure logic sequence, as described in:
https://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
Fixes PR26110.
Differential Revision: http://reviews.llvm.org/D17181
llvm-svn: 261023
2016-02-17 06:14:03 +08:00
|
|
|
; SSE2-NEXT: pxor %xmm1, %xmm0
|
|
|
|
; SSE2-NEXT: psubd %xmm1, %xmm0
|
2016-02-17 06:13:59 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: blend_neg_logic_v4i32:
|
|
|
|
; SSSE3: # BB#0: # %entry
|
[X86] Don't turn (c?-v:v) into (c?-v:0) by blindly using PSIGN.
Currently, we sometimes miscompile this vector pattern:
(c ? -v : v)
We lower it to (because "c" is <4 x i1>, lowered as a vector mask):
(~c & v) | (c & -v)
When we have SSSE3, we incorrectly lower that to PSIGN, which does:
(c < 0 ? -v : c > 0 ? v : 0)
in other words, when c is either all-ones or all-zero:
(c ? -v : 0)
While this is an old bug, it rarely triggers because the PSIGN combine
is too sensitive to operand order. This will be improved separately.
Note that the PSIGN tests are also incorrect. Consider:
%b.lobit = ashr <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
%sub = sub nsw <4 x i32> zeroinitializer, %a
%0 = xor <4 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1>
%1 = and <4 x i32> %a, %0
%2 = and <4 x i32> %b.lobit, %sub
%cond = or <4 x i32> %1, %2
ret <4 x i32> %cond
if %b is zero:
%b.lobit = <4 x i32> zeroinitializer
%sub = sub nsw <4 x i32> zeroinitializer, %a
%0 = <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
%1 = <4 x i32> %a
%2 = <4 x i32> zeroinitializer
%cond = or <4 x i32> %a, zeroinitializer
ret <4 x i32> %a
whereas we currently generate:
psignd %xmm1, %xmm0
retq
which returns 0, as %xmm1 is 0.
Instead, use a pure logic sequence, as described in:
https://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
Fixes PR26110.
Differential Revision: http://reviews.llvm.org/D17181
llvm-svn: 261023
2016-02-17 06:14:03 +08:00
|
|
|
; SSSE3-NEXT: psrad $31, %xmm1
|
|
|
|
; SSSE3-NEXT: pxor %xmm1, %xmm0
|
|
|
|
; SSSE3-NEXT: psubd %xmm1, %xmm0
|
2016-02-17 06:13:59 +08:00
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSE41-LABEL: blend_neg_logic_v4i32:
|
|
|
|
; SSE41: # BB#0: # %entry
|
[X86] Don't turn (c?-v:v) into (c?-v:0) by blindly using PSIGN.
Currently, we sometimes miscompile this vector pattern:
(c ? -v : v)
We lower it to (because "c" is <4 x i1>, lowered as a vector mask):
(~c & v) | (c & -v)
When we have SSSE3, we incorrectly lower that to PSIGN, which does:
(c < 0 ? -v : c > 0 ? v : 0)
in other words, when c is either all-ones or all-zero:
(c ? -v : 0)
While this is an old bug, it rarely triggers because the PSIGN combine
is too sensitive to operand order. This will be improved separately.
Note that the PSIGN tests are also incorrect. Consider:
%b.lobit = ashr <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
%sub = sub nsw <4 x i32> zeroinitializer, %a
%0 = xor <4 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1>
%1 = and <4 x i32> %a, %0
%2 = and <4 x i32> %b.lobit, %sub
%cond = or <4 x i32> %1, %2
ret <4 x i32> %cond
if %b is zero:
%b.lobit = <4 x i32> zeroinitializer
%sub = sub nsw <4 x i32> zeroinitializer, %a
%0 = <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
%1 = <4 x i32> %a
%2 = <4 x i32> zeroinitializer
%cond = or <4 x i32> %a, zeroinitializer
ret <4 x i32> %a
whereas we currently generate:
psignd %xmm1, %xmm0
retq
which returns 0, as %xmm1 is 0.
Instead, use a pure logic sequence, as described in:
https://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
Fixes PR26110.
Differential Revision: http://reviews.llvm.org/D17181
llvm-svn: 261023
2016-02-17 06:14:03 +08:00
|
|
|
; SSE41-NEXT: psrad $31, %xmm1
|
|
|
|
; SSE41-NEXT: pxor %xmm1, %xmm0
|
|
|
|
; SSE41-NEXT: psubd %xmm1, %xmm0
|
2016-02-17 06:13:59 +08:00
|
|
|
; SSE41-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX-LABEL: blend_neg_logic_v4i32:
|
|
|
|
; AVX: # BB#0: # %entry
|
[X86] Don't turn (c?-v:v) into (c?-v:0) by blindly using PSIGN.
Currently, we sometimes miscompile this vector pattern:
(c ? -v : v)
We lower it to (because "c" is <4 x i1>, lowered as a vector mask):
(~c & v) | (c & -v)
When we have SSSE3, we incorrectly lower that to PSIGN, which does:
(c < 0 ? -v : c > 0 ? v : 0)
in other words, when c is either all-ones or all-zero:
(c ? -v : 0)
While this is an old bug, it rarely triggers because the PSIGN combine
is too sensitive to operand order. This will be improved separately.
Note that the PSIGN tests are also incorrect. Consider:
%b.lobit = ashr <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
%sub = sub nsw <4 x i32> zeroinitializer, %a
%0 = xor <4 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1>
%1 = and <4 x i32> %a, %0
%2 = and <4 x i32> %b.lobit, %sub
%cond = or <4 x i32> %1, %2
ret <4 x i32> %cond
if %b is zero:
%b.lobit = <4 x i32> zeroinitializer
%sub = sub nsw <4 x i32> zeroinitializer, %a
%0 = <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
%1 = <4 x i32> %a
%2 = <4 x i32> zeroinitializer
%cond = or <4 x i32> %a, zeroinitializer
ret <4 x i32> %a
whereas we currently generate:
psignd %xmm1, %xmm0
retq
which returns 0, as %xmm1 is 0.
Instead, use a pure logic sequence, as described in:
https://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
Fixes PR26110.
Differential Revision: http://reviews.llvm.org/D17181
llvm-svn: 261023
2016-02-17 06:14:03 +08:00
|
|
|
; AVX-NEXT: vpsrad $31, %xmm1, %xmm1
|
|
|
|
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
|
|
|
|
; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
|
2016-02-17 06:13:59 +08:00
|
|
|
; AVX-NEXT: retq
|
|
|
|
entry:
|
|
|
|
%b.lobit = ashr <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
|
|
|
|
%sub = sub nsw <4 x i32> zeroinitializer, %a
|
|
|
|
%0 = xor <4 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1>
|
|
|
|
%1 = and <4 x i32> %a, %0
|
|
|
|
%2 = and <4 x i32> %b.lobit, %sub
|
|
|
|
%cond = or <4 x i32> %1, %2
|
|
|
|
ret <4 x i32> %cond
|
|
|
|
}
|
|
|
|
|
|
|
|
define <8 x i32> @blend_neg_logic_v8i32(<8 x i32> %a, <8 x i32> %b) {
|
|
|
|
; SSE2-LABEL: blend_neg_logic_v8i32:
|
|
|
|
; SSE2: # BB#0: # %entry
|
|
|
|
; SSE2-NEXT: psrad $31, %xmm3
|
[X86] Don't turn (c?-v:v) into (c?-v:0) by blindly using PSIGN.
Currently, we sometimes miscompile this vector pattern:
(c ? -v : v)
We lower it to (because "c" is <4 x i1>, lowered as a vector mask):
(~c & v) | (c & -v)
When we have SSSE3, we incorrectly lower that to PSIGN, which does:
(c < 0 ? -v : c > 0 ? v : 0)
in other words, when c is either all-ones or all-zero:
(c ? -v : 0)
While this is an old bug, it rarely triggers because the PSIGN combine
is too sensitive to operand order. This will be improved separately.
Note that the PSIGN tests are also incorrect. Consider:
%b.lobit = ashr <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
%sub = sub nsw <4 x i32> zeroinitializer, %a
%0 = xor <4 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1>
%1 = and <4 x i32> %a, %0
%2 = and <4 x i32> %b.lobit, %sub
%cond = or <4 x i32> %1, %2
ret <4 x i32> %cond
if %b is zero:
%b.lobit = <4 x i32> zeroinitializer
%sub = sub nsw <4 x i32> zeroinitializer, %a
%0 = <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
%1 = <4 x i32> %a
%2 = <4 x i32> zeroinitializer
%cond = or <4 x i32> %a, zeroinitializer
ret <4 x i32> %a
whereas we currently generate:
psignd %xmm1, %xmm0
retq
which returns 0, as %xmm1 is 0.
Instead, use a pure logic sequence, as described in:
https://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
Fixes PR26110.
Differential Revision: http://reviews.llvm.org/D17181
llvm-svn: 261023
2016-02-17 06:14:03 +08:00
|
|
|
; SSE2-NEXT: psrad $31, %xmm2
|
|
|
|
; SSE2-NEXT: pxor %xmm2, %xmm0
|
|
|
|
; SSE2-NEXT: psubd %xmm2, %xmm0
|
|
|
|
; SSE2-NEXT: pxor %xmm3, %xmm1
|
|
|
|
; SSE2-NEXT: psubd %xmm3, %xmm1
|
2016-02-17 06:13:59 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: blend_neg_logic_v8i32:
|
|
|
|
; SSSE3: # BB#0: # %entry
|
[X86] Don't turn (c?-v:v) into (c?-v:0) by blindly using PSIGN.
Currently, we sometimes miscompile this vector pattern:
(c ? -v : v)
We lower it to (because "c" is <4 x i1>, lowered as a vector mask):
(~c & v) | (c & -v)
When we have SSSE3, we incorrectly lower that to PSIGN, which does:
(c < 0 ? -v : c > 0 ? v : 0)
in other words, when c is either all-ones or all-zero:
(c ? -v : 0)
While this is an old bug, it rarely triggers because the PSIGN combine
is too sensitive to operand order. This will be improved separately.
Note that the PSIGN tests are also incorrect. Consider:
%b.lobit = ashr <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
%sub = sub nsw <4 x i32> zeroinitializer, %a
%0 = xor <4 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1>
%1 = and <4 x i32> %a, %0
%2 = and <4 x i32> %b.lobit, %sub
%cond = or <4 x i32> %1, %2
ret <4 x i32> %cond
if %b is zero:
%b.lobit = <4 x i32> zeroinitializer
%sub = sub nsw <4 x i32> zeroinitializer, %a
%0 = <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
%1 = <4 x i32> %a
%2 = <4 x i32> zeroinitializer
%cond = or <4 x i32> %a, zeroinitializer
ret <4 x i32> %a
whereas we currently generate:
psignd %xmm1, %xmm0
retq
which returns 0, as %xmm1 is 0.
Instead, use a pure logic sequence, as described in:
https://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
Fixes PR26110.
Differential Revision: http://reviews.llvm.org/D17181
llvm-svn: 261023
2016-02-17 06:14:03 +08:00
|
|
|
; SSSE3-NEXT: psrad $31, %xmm3
|
|
|
|
; SSSE3-NEXT: psrad $31, %xmm2
|
|
|
|
; SSSE3-NEXT: pxor %xmm2, %xmm0
|
|
|
|
; SSSE3-NEXT: psubd %xmm2, %xmm0
|
|
|
|
; SSSE3-NEXT: pxor %xmm3, %xmm1
|
|
|
|
; SSSE3-NEXT: psubd %xmm3, %xmm1
|
2016-02-17 06:13:59 +08:00
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSE41-LABEL: blend_neg_logic_v8i32:
|
|
|
|
; SSE41: # BB#0: # %entry
|
[X86] Don't turn (c?-v:v) into (c?-v:0) by blindly using PSIGN.
Currently, we sometimes miscompile this vector pattern:
(c ? -v : v)
We lower it to (because "c" is <4 x i1>, lowered as a vector mask):
(~c & v) | (c & -v)
When we have SSSE3, we incorrectly lower that to PSIGN, which does:
(c < 0 ? -v : c > 0 ? v : 0)
in other words, when c is either all-ones or all-zero:
(c ? -v : 0)
While this is an old bug, it rarely triggers because the PSIGN combine
is too sensitive to operand order. This will be improved separately.
Note that the PSIGN tests are also incorrect. Consider:
%b.lobit = ashr <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
%sub = sub nsw <4 x i32> zeroinitializer, %a
%0 = xor <4 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1>
%1 = and <4 x i32> %a, %0
%2 = and <4 x i32> %b.lobit, %sub
%cond = or <4 x i32> %1, %2
ret <4 x i32> %cond
if %b is zero:
%b.lobit = <4 x i32> zeroinitializer
%sub = sub nsw <4 x i32> zeroinitializer, %a
%0 = <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
%1 = <4 x i32> %a
%2 = <4 x i32> zeroinitializer
%cond = or <4 x i32> %a, zeroinitializer
ret <4 x i32> %a
whereas we currently generate:
psignd %xmm1, %xmm0
retq
which returns 0, as %xmm1 is 0.
Instead, use a pure logic sequence, as described in:
https://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
Fixes PR26110.
Differential Revision: http://reviews.llvm.org/D17181
llvm-svn: 261023
2016-02-17 06:14:03 +08:00
|
|
|
; SSE41-NEXT: psrad $31, %xmm3
|
|
|
|
; SSE41-NEXT: psrad $31, %xmm2
|
|
|
|
; SSE41-NEXT: pxor %xmm2, %xmm0
|
|
|
|
; SSE41-NEXT: psubd %xmm2, %xmm0
|
|
|
|
; SSE41-NEXT: pxor %xmm3, %xmm1
|
|
|
|
; SSE41-NEXT: psubd %xmm3, %xmm1
|
2016-02-17 06:13:59 +08:00
|
|
|
; SSE41-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: blend_neg_logic_v8i32:
|
|
|
|
; AVX1: # BB#0: # %entry
|
|
|
|
; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
|
|
|
|
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
|
|
|
|
; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
|
|
|
|
; AVX1-NEXT: vandnps %ymm0, %ymm1, %ymm0
|
|
|
|
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
|
|
|
|
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: blend_neg_logic_v8i32:
|
|
|
|
; AVX2: # BB#0: # %entry
|
[X86] Don't turn (c?-v:v) into (c?-v:0) by blindly using PSIGN.
Currently, we sometimes miscompile this vector pattern:
(c ? -v : v)
We lower it to (because "c" is <4 x i1>, lowered as a vector mask):
(~c & v) | (c & -v)
When we have SSSE3, we incorrectly lower that to PSIGN, which does:
(c < 0 ? -v : c > 0 ? v : 0)
in other words, when c is either all-ones or all-zero:
(c ? -v : 0)
While this is an old bug, it rarely triggers because the PSIGN combine
is too sensitive to operand order. This will be improved separately.
Note that the PSIGN tests are also incorrect. Consider:
%b.lobit = ashr <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
%sub = sub nsw <4 x i32> zeroinitializer, %a
%0 = xor <4 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1>
%1 = and <4 x i32> %a, %0
%2 = and <4 x i32> %b.lobit, %sub
%cond = or <4 x i32> %1, %2
ret <4 x i32> %cond
if %b is zero:
%b.lobit = <4 x i32> zeroinitializer
%sub = sub nsw <4 x i32> zeroinitializer, %a
%0 = <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
%1 = <4 x i32> %a
%2 = <4 x i32> zeroinitializer
%cond = or <4 x i32> %a, zeroinitializer
ret <4 x i32> %a
whereas we currently generate:
psignd %xmm1, %xmm0
retq
which returns 0, as %xmm1 is 0.
Instead, use a pure logic sequence, as described in:
https://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
Fixes PR26110.
Differential Revision: http://reviews.llvm.org/D17181
llvm-svn: 261023
2016-02-17 06:14:03 +08:00
|
|
|
; AVX2-NEXT: vpsrad $31, %ymm1, %ymm1
|
|
|
|
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
|
2016-02-17 06:13:59 +08:00
|
|
|
; AVX2-NEXT: retq
|
|
|
|
entry:
|
|
|
|
%b.lobit = ashr <8 x i32> %b, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
|
|
|
|
%sub = sub nsw <8 x i32> zeroinitializer, %a
|
|
|
|
%0 = xor <8 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
|
|
|
|
%1 = and <8 x i32> %a, %0
|
|
|
|
%2 = and <8 x i32> %b.lobit, %sub
|
|
|
|
%cond = or <8 x i32> %1, %2
|
|
|
|
ret <8 x i32> %cond
|
|
|
|
}
|
|
|
|
|
|
|
|
define <4 x i32> @blend_neg_logic_v4i32_2(<4 x i32> %v, <4 x i32> %c) {
|
|
|
|
; SSE2-LABEL: blend_neg_logic_v4i32_2:
|
|
|
|
; SSE2: # BB#0: # %entry
|
|
|
|
; SSE2-NEXT: psrld $31, %xmm1
|
|
|
|
; SSE2-NEXT: pslld $31, %xmm1
|
|
|
|
; SSE2-NEXT: psrad $31, %xmm1
|
2016-02-17 06:14:07 +08:00
|
|
|
; SSE2-NEXT: pxor %xmm1, %xmm0
|
|
|
|
; SSE2-NEXT: psubd %xmm1, %xmm0
|
2016-02-17 06:13:59 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: blend_neg_logic_v4i32_2:
|
|
|
|
; SSSE3: # BB#0: # %entry
|
|
|
|
; SSSE3-NEXT: psrld $31, %xmm1
|
|
|
|
; SSSE3-NEXT: pslld $31, %xmm1
|
|
|
|
; SSSE3-NEXT: psrad $31, %xmm1
|
2016-02-17 06:14:07 +08:00
|
|
|
; SSSE3-NEXT: pxor %xmm1, %xmm0
|
|
|
|
; SSSE3-NEXT: psubd %xmm1, %xmm0
|
2016-02-17 06:13:59 +08:00
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSE41-LABEL: blend_neg_logic_v4i32_2:
|
|
|
|
; SSE41: # BB#0: # %entry
|
|
|
|
; SSE41-NEXT: movdqa %xmm0, %xmm2
|
|
|
|
; SSE41-NEXT: psrld $31, %xmm1
|
|
|
|
; SSE41-NEXT: pslld $31, %xmm1
|
|
|
|
; SSE41-NEXT: pxor %xmm3, %xmm3
|
|
|
|
; SSE41-NEXT: psubd %xmm2, %xmm3
|
|
|
|
; SSE41-NEXT: movdqa %xmm1, %xmm0
|
|
|
|
; SSE41-NEXT: blendvps %xmm2, %xmm3
|
|
|
|
; SSE41-NEXT: movaps %xmm3, %xmm0
|
|
|
|
; SSE41-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX-LABEL: blend_neg_logic_v4i32_2:
|
|
|
|
; AVX: # BB#0: # %entry
|
|
|
|
; AVX-NEXT: vpsrld $31, %xmm1, %xmm1
|
|
|
|
; AVX-NEXT: vpslld $31, %xmm1, %xmm1
|
|
|
|
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
|
|
|
; AVX-NEXT: vpsubd %xmm0, %xmm2, %xmm2
|
|
|
|
; AVX-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
|
|
|
|
; AVX-NEXT: retq
|
|
|
|
entry:
|
|
|
|
%0 = ashr <4 x i32> %c, <i32 31, i32 31, i32 31, i32 31>
|
|
|
|
%1 = trunc <4 x i32> %0 to <4 x i1>
|
|
|
|
%2 = sub nsw <4 x i32> zeroinitializer, %v
|
|
|
|
%3 = select <4 x i1> %1, <4 x i32> %v, <4 x i32> %2
|
|
|
|
ret <4 x i32> %3
|
|
|
|
}
|