; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CDVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,-avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CD
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQ
;
; Just one 32-bit run to make sure we do reasonable things for i64 tzcnt.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX2
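;
; Rough shape of the expected lowering (a summary, not a generated assertion):
; with no native vector tzcnt, cttz is lowered as a popcount of the
; trailing-zero mask, approximately
;   cttz(x) = popcnt((x & -x) - 1)
; where the byte popcount uses a nibble lookup table (vpshufb) and the
; per-element byte sums are accumulated with vpsadbw against zero.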
define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm5
; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv4i64:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv4i64:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm2
; AVX512CDVL-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512CDVL-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX512CDVL-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv4i64:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm2
; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512CD-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX512CD-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv4i64:
; AVX512VPOPCNTDQ: # BB#0:
; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpsubq %ymm0, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512VPOPCNTDQ-NEXT: retq
;
; X32-AVX-LABEL: testv4i64:
; X32-AVX: # BB#0:
; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
; X32-AVX-NEXT: vpsubq %ymm0, %ymm1, %ymm2
; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: vpsubq {{\.LCPI.*}}, %ymm0, %ymm0
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm3
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; X32-AVX-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; X32-AVX-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; X32-AVX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: retl
%out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 0)
ret <4 x i64> %out
}
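;
; The zero-undef variant below calls llvm.cttz with i1 -1, so the result for a
; zero input is undefined. That freedom lets the AVX512CD runs replace the
; popcount sequence with vplzcntq, roughly cttz(x) = 63 - lzcnt(x & -x); the
; value broadcast from the constant pool is not spelled out in the checks, but
; it is presumably the splat constant 63.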
define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64u:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm5
; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv4i64u:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv4i64u:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vplzcntq %ymm0, %ymm0
; AVX512CDVL-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm0
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv4i64u:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
; AVX512CD-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm0
; AVX512CD-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv4i64u:
; AVX512VPOPCNTDQ: # BB#0:
; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpsubq %ymm0, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512VPOPCNTDQ-NEXT: retq
;
; X32-AVX-LABEL: testv4i64u:
; X32-AVX: # BB#0:
; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
; X32-AVX-NEXT: vpsubq %ymm0, %ymm1, %ymm2
; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: vpsubq {{\.LCPI.*}}, %ymm0, %ymm0
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm3
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; X32-AVX-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; X32-AVX-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; X32-AVX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: retl
%out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 -1)
ret <4 x i64> %out
}
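;
; For the v8i32 tests the same nibble-LUT popcount is used, but the byte counts
; have to be reduced per 32-bit element rather than per 64-bit lane: roughly,
; the AVX1 checks below zero-extend the even dwords (vpmovzxdq), interleave the
; odd dwords with zero (vpunpckhdq), run vpsadbw on each half, and pack the two
; halves back together with vpackuswb.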
define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX1-NEXT: vpsadbw %xmm2, %xmm5, %xmm5
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm5
; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX1-NEXT: vpsadbw %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv8i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv8i32:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm2
; AVX512CDVL-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512CDVL-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX512CDVL-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX512CDVL-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; AVX512CDVL-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX512CDVL-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv8i32:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm2
; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512CD-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX512CD-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX512CD-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; AVX512CD-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX512CD-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv8i32:
; AVX512VPOPCNTDQ: # BB#0:
; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpsubd %ymm0, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512VPOPCNTDQ-NEXT: retq
;
; X32-AVX-LABEL: testv8i32:
; X32-AVX: # BB#0:
; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
; X32-AVX-NEXT: vpsubd %ymm0, %ymm1, %ymm2
; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; X32-AVX-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm3
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; X32-AVX-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; X32-AVX-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; X32-AVX-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; X32-AVX-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; X32-AVX-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; X32-AVX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: retl
%out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 0)
ret <8 x i32> %out
}
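
; @testv8i32u passes true for the is_zero_undef flag of llvm.cttz, which lets
; the AVX512CD/CDVL lowerings below use vplzcntd on the isolated lowest set bit
; (tzcnt(x) == 31 - lzcnt(x & -x) for non-zero x) instead of the generic
; pshufb-based popcount sequence used by the non-'u' variant above.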
define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32u:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX1-NEXT: vpsadbw %xmm2, %xmm5, %xmm5
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm5
; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX1-NEXT: vpsadbw %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv8i32u:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv8i32u:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vplzcntd %ymm0, %ymm0
; AVX512CDVL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm0
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv8i32u:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm0
; AVX512CD-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv8i32u:
; AVX512VPOPCNTDQ: # BB#0:
; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpsubd %ymm0, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512VPOPCNTDQ-NEXT: retq
;
; X32-AVX-LABEL: testv8i32u:
; X32-AVX: # BB#0:
; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
; X32-AVX-NEXT: vpsubd %ymm0, %ymm1, %ymm2
; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; X32-AVX-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm3
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; X32-AVX-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; X32-AVX-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; X32-AVX-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; X32-AVX-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; X32-AVX-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; X32-AVX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: retl
%out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 -1)
ret <8 x i32> %out
}
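
; For v16i16, the AVX1 lowering below computes popcount((x & -x) + <-1,...>)
; per element: the 256-bit input is split into 128-bit halves (vextractf128),
; each half goes through the pshufb nibble-LUT popcount, and the per-byte
; counts are summed into 16-bit lanes with vpsllw/vpaddb/vpsrlw rather than
; the vpsadbw reduction used for the wider element types above.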
define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2
; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $8, %xmm2, %xmm5
; AVX1-NEXT: vpaddb %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
|
[x86] transform vector inc/dec to use -1 constant (PR33483)
Convert vector increment or decrement to sub/add with an all-ones constant:
add X, <1, 1...> --> sub X, <-1, -1...>
sub X, <1, 1...> --> add X, <-1, -1...>
The all-ones vector constant can be materialized using a pcmpeq instruction that is
commonly recognized as an idiom (has no register dependency), so that's better than
loading a splat 1 constant.
AVX512 uses 'vpternlogd' for 512-bit vectors because there is apparently no better
way to produce 512 one-bits.
The general advantages of this lowering are:
1. pcmpeq has lower latency than a memop on every uarch I looked at in Agner's tables,
so in theory, this could be better for perf, but...
2. That seems unlikely to affect any OOO implementation, and I can't measure any real
perf difference from this transform on Haswell or Jaguar, but...
3. It doesn't look like it from the diffs, but this is an overall size win because we
eliminate 16 - 64 constant bytes in the case of a vector load. If we're broadcasting
a scalar load (which might itself be a bug), then we're replacing a scalar constant
load + broadcast with a single cheap op, so that should always be smaller/better too.
4. This makes the DAG/isel output more consistent - we use pcmpeq already for padd x, -1
and psub x, -1, so we should use that form for +1 too because we can. If there's some
reason to favor a constant load on some CPU, let's make the reverse transform for all
of these cases (either here in the DAG or in a later machine pass).
This should fix:
https://bugs.llvm.org/show_bug.cgi?id=33483
Differential Revision: https://reviews.llvm.org/D34336
llvm-svn: 306289
2017-06-26 22:19:26 +08:00
|
|
|
; AVX1-NEXT: vpaddw %xmm3, %xmm0, %xmm0
|
[DAGCombiner] use narrow vector ops to eliminate concat/extract (PR32790)
In the best case:
extract (binop (concat X1, X2), (concat Y1, Y2)), N --> binop XN, YN
...we kill all of the extract/concat and just have narrow binops remaining.
If only one of the binop operands is amenable, this transform is still
worthwhile because we kill some of the extract/concat.
Optional bitcasting makes the code more complicated, but there doesn't
seem to be a way to avoid that.
The TODO about extending to more than bitwise logic is there because we really
will regress several x86 tests including madd, psad, and even a plain
integer-multiply-by-2 or shift-left-by-1. I don't think there's anything
fundamentally wrong with this patch that would cause those regressions; those
folds are just missing or brittle.
If we extend to more binops, I found that this patch will fire on at least one
non-x86 regression test. There's an ARM NEON test in
test/CodeGen/ARM/coalesce-subregs.ll with a pattern like:
t5: v2f32 = vector_shuffle<0,3> t2, t4
t6: v1i64 = bitcast t5
t8: v1i64 = BUILD_VECTOR Constant:i64<0>
t9: v2i64 = concat_vectors t6, t8
t10: v4f32 = bitcast t9
t12: v4f32 = fmul t11, t10
t13: v2i64 = bitcast t12
t16: v1i64 = extract_subvector t13, Constant:i32<0>
There was no functional change in the codegen from this transform from what I
could see though.
For the x86 test changes:
1. PR32790() is the closest call. We don't reduce the AVX1 instruction count in that case,
but we improve throughput. Also, on a core like Jaguar that double-pumps 256-bit ops,
there's an unseen win because two 128-bit ops have the same cost as the wider 256-bit op.
SSE/AVX2/AXV512 are not affected which is expected because only AVX1 has the extract/concat
ops to match the pattern.
2. do_not_use_256bit_op() is the best case. Everyone wins by avoiding the concat/extract.
Related bug for IR filed as: https://bugs.llvm.org/show_bug.cgi?id=33026
3. The SSE diffs in vector-trunc-math.ll are just scheduling/RA, so nothing real AFAICT.
4. The AVX1 diffs in vector-tzcnt-256.ll are all the same pattern: we reduced the instruction
count by one in each case by eliminating two insert/extract while adding one narrower logic op.
https://bugs.llvm.org/show_bug.cgi?id=32790
Differential Revision: https://reviews.llvm.org/D33137
llvm-svn: 303997
2017-05-26 23:33:18 +08:00
|
|
|
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm1
|
|
|
|
; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
|
2015-09-19 21:22:57 +08:00
|
|
|
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
|
[DAGCombiner] use narrow vector ops to eliminate concat/extract (PR32790)
In the best case:
extract (binop (concat X1, X2), (concat Y1, Y2)), N --> binop XN, YN
...we kill all of the extract/concat and just have narrow binops remaining.
If only one of the binop operands is amenable, this transform is still
worthwhile because we kill some of the extract/concat.
Optional bitcasting makes the code more complicated, but there doesn't
seem to be a way to avoid that.
The TODO about extending to more than bitwise logic is there because we really
will regress several x86 tests including madd, psad, and even a plain
integer-multiply-by-2 or shift-left-by-1. I don't think there's anything
fundamentally wrong with this patch that would cause those regressions; those
folds are just missing or brittle.
If we extend to more binops, I found that this patch will fire on at least one
non-x86 regression test. There's an ARM NEON test in
test/CodeGen/ARM/coalesce-subregs.ll with a pattern like:
t5: v2f32 = vector_shuffle<0,3> t2, t4
t6: v1i64 = bitcast t5
t8: v1i64 = BUILD_VECTOR Constant:i64<0>
t9: v2i64 = concat_vectors t6, t8
t10: v4f32 = bitcast t9
t12: v4f32 = fmul t11, t10
t13: v2i64 = bitcast t12
t16: v1i64 = extract_subvector t13, Constant:i32<0>
There was no functional change in the codegen from this transform from what I
could see though.
For the x86 test changes:
1. PR32790() is the closest call. We don't reduce the AVX1 instruction count in that case,
but we improve throughput. Also, on a core like Jaguar that double-pumps 256-bit ops,
there's an unseen win because two 128-bit ops have the same cost as the wider 256-bit op.
SSE/AVX2/AXV512 are not affected which is expected because only AVX1 has the extract/concat
ops to match the pattern.
2. do_not_use_256bit_op() is the best case. Everyone wins by avoiding the concat/extract.
Related bug for IR filed as: https://bugs.llvm.org/show_bug.cgi?id=33026
3. The SSE diffs in vector-trunc-math.ll are just scheduling/RA, so nothing real AFAICT.
4. The AVX1 diffs in vector-tzcnt-256.ll are all the same pattern: we reduced the instruction
count by one in each case by eliminating two insert/extract while adding one narrower logic op.
https://bugs.llvm.org/show_bug.cgi?id=32790
Differential Revision: https://reviews.llvm.org/D33137
llvm-svn: 303997
2017-05-26 23:33:18 +08:00
|
|
|
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
|
2015-09-19 21:22:57 +08:00
|
|
|
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
|
|
|
|
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
|
|
|
|
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
|
2015-06-08 05:01:34 +08:00
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: testv16i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: retq
;
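; NOTE (explanatory comment, not a generated check line): the expansions above
; compute cttz as a population count. vpxor/vpsubw/vpand isolates the lowest set
; bit (x & -x), vpcmpeqd materializes all-ones so vpaddw can subtract 1 from it,
; vpand with the splat-15 mask plus vpshufb against the 16-entry table gives the
; popcount of the low and high nibbles, vpaddb sums them per byte, and the final
; vpsllw $8 / vpaddb / vpsrlw $8 folds the two byte counts of each 16-bit lane
; into the word result.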
; AVX512CDVL-LABEL: testv16i16:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512CDVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512CDVL-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512CDVL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv16i16:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512CD-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512CD-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv16i16:
; AVX512VPOPCNTDQ: # BB#0:
; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
; X32-AVX-LABEL: testv16i16:
; X32-AVX: # BB#0:
; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
; X32-AVX-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X32-AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; X32-AVX-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: vpsllw $8, %ymm0, %ymm1
; X32-AVX-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; X32-AVX-NEXT: vpsrlw $8, %ymm0, %ymm0
; X32-AVX-NEXT: retl
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %in, i1 0)
  ret <16 x i16> %out
}
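
; A minimal scalar sketch (illustrative only, not exercised by the checks above;
; @cttz16_popcount_sketch is a hypothetical name): the vector expansions rely on
; the identity cttz(x) == ctpop((x & -x) - 1), which also holds for x == 0 in
; the zero-is-defined form, where the answer is the full bit width.
define i16 @cttz16_popcount_sketch(i16 %x) nounwind {
  %neg = sub i16 0, %x                       ; -x
  %lsb = and i16 %x, %neg                    ; isolate the lowest set bit
  %mask = add i16 %lsb, -1                   ; set exactly cttz(x) low bits
  %cnt = call i16 @llvm.ctpop.i16(i16 %mask) ; count them
  ret i16 %cnt
}
declare i16 @llvm.ctpop.i16(i16)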

define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16u:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2
; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $8, %xmm2, %xmm5
; AVX1-NEXT: vpaddb %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
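; NOTE (explanatory comment, not a generated check line): AVX1 has no 256-bit
; integer operations, so the expansion above runs the same popcount sequence on
; each 128-bit half (vextractf128 to split, vinsertf128 to rejoin), sharing the
; zero, all-ones, mask, and table registers between the two halves.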
; AVX2-LABEL: testv16i16u:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv16i16u:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512CDVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512CDVL-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512CDVL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv16i16u:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512CD-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512CD-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv16i16u:
; AVX512VPOPCNTDQ: # BB#0:
; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
; X32-AVX-LABEL: testv16i16u:
; X32-AVX: # BB#0:
; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
; X32-AVX-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X32-AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; X32-AVX-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: vpsllw $8, %ymm0, %ymm1
; X32-AVX-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; X32-AVX-NEXT: vpsrlw $8, %ymm0, %ymm0
; X32-AVX-NEXT: retl
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %in, i1 -1)
  ret <16 x i16> %out
}
|
|
|
|
|
2015-07-20 01:09:43 +08:00
|
|
|
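; NOTE: The vector cttz expansions checked throughout this file all use the
; same cttz-to-popcnt rewrite: cttz(x) is computed as popcnt((x & -x) - 1),
; i.e. negate to isolate the lowest set bit, subtract one to turn it into a
; mask of the trailing zeros, and population-count that mask with the nibble
; pshufb lookup table (the [0,1,1,2,...] constant). For i16 elements the
; per-byte counts are then summed into words by the vpsllw/vpaddb/vpsrlw tail.
; The scalar IR below is an illustrative sketch of the same identity only; it
; is not part of the autogenerated checks and the function name is made up:
;
;   define i8 @cttz_sketch(i8 %x) {
;     %neg  = sub i8 0, %x                      ; -x
;     %low  = and i8 %x, %neg                   ; isolate lowest set bit
;     %mask = add i8 %low, -1                   ; ones below that bit
;     %cnt  = call i8 @llvm.ctpop.i8(i8 %mask)  ; popcnt(mask) == cttz(x)
;     ret i8 %cnt
;   }
;   declare i8 @llvm.ctpop.i8(i8)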
define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm0, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv32i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv32i8:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512CDVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv32i8:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv32i8:
; AVX512VPOPCNTDQ: # BB#0:
; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
; X32-AVX-LABEL: testv32i8:
; X32-AVX: # BB#0:
; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
; X32-AVX-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X32-AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; X32-AVX-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: retl
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %in, i1 0)
  ret <32 x i8> %out
}

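; NOTE: testv32i8 above passes i1 0 to @llvm.cttz.v32i8 (a zero input is
; defined and yields the bit width), while testv32i8u below passes i1 -1 (a
; zero input is undefined). The checked byte expansions are identical for both
; variants here; the flag only matters for lowerings that can exploit the
; undef-on-zero case.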
define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8u:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm0, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv32i8u:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv32i8u:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512CDVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv32i8u:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv32i8u:
; AVX512VPOPCNTDQ: # BB#0:
; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
; X32-AVX-LABEL: testv32i8u:
; X32-AVX: # BB#0:
; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
; X32-AVX-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X32-AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; X32-AVX-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: retl
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %in, i1 -1)
  ret <32 x i8> %out
}

define <4 x i64> @foldv4i64() nounwind {
|
2016-10-21 18:50:52 +08:00
|
|
|
; AVX-LABEL: foldv4i64:
|
|
|
|
; AVX: # BB#0:
|
|
|
|
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
|
|
|
|
; AVX-NEXT: retq
|
|
|
|
;
|
|
|
|
; X32-AVX-LABEL: foldv4i64:
|
|
|
|
; X32-AVX: # BB#0:
|
2017-02-10 22:37:25 +08:00
|
|
|
; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,0,0,64,0,0,0]
|
2016-10-21 18:50:52 +08:00
|
|
|
; X32-AVX-NEXT: retl
|
2015-06-08 17:57:09 +08:00
|
|
|
%out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 0)
|
|
|
|
ret <4 x i64> %out
|
|
|
|
}
|
|
|
|
|
2015-07-20 01:09:43 +08:00
|
|
|
define <4 x i64> @foldv4i64u() nounwind {
|
2016-10-21 18:50:52 +08:00
|
|
|
; AVX-LABEL: foldv4i64u:
|
|
|
|
; AVX: # BB#0:
|
|
|
|
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
|
|
|
|
; AVX-NEXT: retq
|
|
|
|
;
|
|
|
|
; X32-AVX-LABEL: foldv4i64u:
|
|
|
|
; X32-AVX: # BB#0:
|
2017-02-10 22:37:25 +08:00
|
|
|
; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,0,0,64,0,0,0]
|
2016-10-21 18:50:52 +08:00
|
|
|
; X32-AVX-NEXT: retl
|
2015-06-08 17:57:09 +08:00
|
|
|
%out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 -1)
|
|
|
|
ret <4 x i64> %out
|
|
|
|
}
|
|
|
|
|
2015-07-20 01:09:43 +08:00
|
|
|
define <8 x i32> @foldv8i32() nounwind {
|
2016-10-21 18:50:52 +08:00
|
|
|
; AVX-LABEL: foldv8i32:
|
|
|
|
; AVX: # BB#0:
|
|
|
|
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
|
|
|
|
; AVX-NEXT: retq
|
|
|
|
;
|
|
|
|
; X32-AVX-LABEL: foldv8i32:
|
|
|
|
; X32-AVX: # BB#0:
|
|
|
|
; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
|
|
|
|
; X32-AVX-NEXT: retl
|
2015-06-08 17:57:09 +08:00
|
|
|
%out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 0)
|
|
|
|
ret <8 x i32> %out
|
|
|
|
}
|
|
|
|
|
2015-07-20 01:09:43 +08:00
|
|
|
define <8 x i32> @foldv8i32u() nounwind {
|
2016-10-21 18:50:52 +08:00
|
|
|
; AVX-LABEL: foldv8i32u:
|
|
|
|
; AVX: # BB#0:
|
|
|
|
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
|
|
|
|
; AVX-NEXT: retq
|
|
|
|
;
|
|
|
|
; X32-AVX-LABEL: foldv8i32u:
|
|
|
|
; X32-AVX: # BB#0:
|
|
|
|
; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
|
|
|
|
; X32-AVX-NEXT: retl
|
2015-06-08 17:57:09 +08:00
|
|
|
%out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 -1)
|
|
|
|
ret <8 x i32> %out
|
|
|
|
}
|
|
|
|
|
2015-07-20 01:09:43 +08:00
|
|
|
define <16 x i16> @foldv16i16() nounwind {
|
2016-10-21 18:50:52 +08:00
|
|
|
; AVX-LABEL: foldv16i16:
|
|
|
|
; AVX: # BB#0:
|
|
|
|
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
|
|
|
|
; AVX-NEXT: retq
|
|
|
|
;
|
|
|
|
; X32-AVX-LABEL: foldv16i16:
|
|
|
|
; X32-AVX: # BB#0:
|
|
|
|
; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
|
|
|
|
; X32-AVX-NEXT: retl
|
2015-06-08 17:57:09 +08:00
|
|
|
%out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 0)
|
|
|
|
ret <16 x i16> %out
|
|
|
|
}
|
|
|
|
|
2015-07-20 01:09:43 +08:00
|
|
|
define <16 x i16> @foldv16i16u() nounwind {
|
2016-10-21 18:50:52 +08:00
|
|
|
; AVX-LABEL: foldv16i16u:
|
|
|
|
; AVX: # BB#0:
|
|
|
|
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
|
|
|
|
; AVX-NEXT: retq
|
|
|
|
;
|
|
|
|
; X32-AVX-LABEL: foldv16i16u:
|
|
|
|
; X32-AVX: # BB#0:
|
|
|
|
; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
|
|
|
|
; X32-AVX-NEXT: retl
|
2015-06-08 17:57:09 +08:00
|
|
|
%out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 -1)
|
|
|
|
ret <16 x i16> %out
|
|
|
|
}
|
|
|
|
|
2015-07-20 01:09:43 +08:00
|
|
|
define <32 x i8> @foldv32i8() nounwind {
|
2016-10-21 18:50:52 +08:00
|
|
|
; AVX-LABEL: foldv32i8:
|
|
|
|
; AVX: # BB#0:
|
|
|
|
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
|
|
|
|
; AVX-NEXT: retq
|
|
|
|
;
|
|
|
|
; X32-AVX-LABEL: foldv32i8:
|
|
|
|
; X32-AVX: # BB#0:
|
|
|
|
; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
|
|
|
|
; X32-AVX-NEXT: retl
|
2015-06-08 17:57:09 +08:00
|
|
|
%out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 0)
|
|
|
|
ret <32 x i8> %out
|
|
|
|
}
|
|
|
|
|
2015-07-20 01:09:43 +08:00
|
|
|
define <32 x i8> @foldv32i8u() nounwind {
|
2016-10-21 18:50:52 +08:00
|
|
|
; AVX-LABEL: foldv32i8u:
|
|
|
|
; AVX: # BB#0:
|
|
|
|
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
|
|
|
|
; AVX-NEXT: retq
|
|
|
|
;
|
|
|
|
; X32-AVX-LABEL: foldv32i8u:
|
|
|
|
; X32-AVX: # BB#0:
|
|
|
|
; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
|
|
|
|
; X32-AVX-NEXT: retl
|
2015-06-08 17:57:09 +08:00
|
|
|
%out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 -1)
|
|
|
|
ret <32 x i8> %out
|
|
|
|
}
|
|
|
|
|
2015-06-08 05:01:34 +08:00
|
|
|
declare <4 x i64> @llvm.cttz.v4i64(<4 x i64>, i1)
|
|
|
|
declare <8 x i32> @llvm.cttz.v8i32(<8 x i32>, i1)
|
|
|
|
declare <16 x i16> @llvm.cttz.v16i16(<16 x i16>, i1)
|
|
|
|
declare <32 x i8> @llvm.cttz.v32i8(<32 x i8>, i1)
|