forked from OSchip/llvm-project
[AArch64] Only mark cost 1 perfect shuffles as legal
The perfect shuffle tables encode a cost of either 0 (a nop-copy) or 1 (a single instruction) with a cost encoding of 0 in the upper 2 bits. All perfect shuffles with any cost are then marked as legal shuffles though (the maximum encoded cost is 3), which can confuse the DAG combiner into thinking the shuffles are cheaper than they should be. Limiting legal shuffles to single instructions seems to do better in most cases, producing fewer instructions for complex shuffles. There are some cases that now become tbl, which may be better or worse depending on whether the instruction is in a loop and the tbl load can be hoisted out. Differential Revision: https://reviews.llvm.org/D123377
This commit is contained in:
parent
76410040b9
commit
cc9495f679
|
@ -11489,7 +11489,9 @@ bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
|
|||
unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
|
||||
unsigned Cost = (PFEntry >> 30);
|
||||
|
||||
if (Cost <= 4)
|
||||
// The cost tables encode cost 0 or cost 1 shuffles using the value 0 in
|
||||
// the top 2 bits.
|
||||
if (Cost == 0)
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -7,10 +7,9 @@ target triple = "aarch64-unknown-linux-gnu"
|
|||
define <4 x i16> @f(<4 x i32> %vqdmlal_v3.i, <8 x i16> %x5) {
|
||||
; CHECK-LABEL: f:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
|
||||
; CHECK-NEXT: uzp1 v0.4h, v0.4h, v0.4h
|
||||
; CHECK-NEXT: ext v1.8b, v0.8b, v1.8b, #4
|
||||
; CHECK-NEXT: uzp1 v0.4h, v1.4h, v0.4h
|
||||
; CHECK-NEXT: dup v0.4h, v0.h[4]
|
||||
; CHECK-NEXT: mov v0.h[1], v1.h[0]
|
||||
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
; Check that we don't just dup the input vector. The code emitted is ext, dup, ext, ext
|
||||
|
|
|
@ -30,9 +30,8 @@ define <2 x i64> @extract0_i32_zext_insert0_i64_zero(<4 x i32> %x) {
|
|||
define <2 x i64> @extract1_i32_zext_insert0_i64_undef(<4 x i32> %x) {
|
||||
; CHECK-LABEL: extract1_i32_zext_insert0_i64_undef:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: movi v1.2d, #0000000000000000
|
||||
; CHECK-NEXT: zip1 v1.4s, v0.4s, v1.4s
|
||||
; CHECK-NEXT: trn2 v0.4s, v0.4s, v1.4s
|
||||
; CHECK-NEXT: mov w8, v0.s[1]
|
||||
; CHECK-NEXT: fmov d0, x8
|
||||
; CHECK-NEXT: ret
|
||||
%e = extractelement <4 x i32> %x, i32 1
|
||||
%z = zext i32 %e to i64
|
||||
|
@ -57,9 +56,8 @@ define <2 x i64> @extract1_i32_zext_insert0_i64_zero(<4 x i32> %x) {
|
|||
define <2 x i64> @extract2_i32_zext_insert0_i64_undef(<4 x i32> %x) {
|
||||
; CHECK-LABEL: extract2_i32_zext_insert0_i64_undef:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: movi v1.2d, #0000000000000000
|
||||
; CHECK-NEXT: uzp1 v1.4s, v0.4s, v1.4s
|
||||
; CHECK-NEXT: zip2 v0.4s, v0.4s, v1.4s
|
||||
; CHECK-NEXT: mov w8, v0.s[2]
|
||||
; CHECK-NEXT: fmov d0, x8
|
||||
; CHECK-NEXT: ret
|
||||
%e = extractelement <4 x i32> %x, i32 2
|
||||
%z = zext i32 %e to i64
|
||||
|
@ -110,9 +108,8 @@ define <2 x i64> @extract3_i32_zext_insert0_i64_zero(<4 x i32> %x) {
|
|||
define <2 x i64> @extract0_i32_zext_insert1_i64_undef(<4 x i32> %x) {
|
||||
; CHECK-LABEL: extract0_i32_zext_insert1_i64_undef:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: movi v1.2d, #0000000000000000
|
||||
; CHECK-NEXT: zip1 v1.4s, v0.4s, v1.4s
|
||||
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #8
|
||||
; CHECK-NEXT: fmov w8, s0
|
||||
; CHECK-NEXT: dup v0.2d, x8
|
||||
; CHECK-NEXT: ret
|
||||
%e = extractelement <4 x i32> %x, i32 0
|
||||
%z = zext i32 %e to i64
|
||||
|
@ -137,9 +134,8 @@ define <2 x i64> @extract0_i32_zext_insert1_i64_zero(<4 x i32> %x) {
|
|||
define <2 x i64> @extract1_i32_zext_insert1_i64_undef(<4 x i32> %x) {
|
||||
; CHECK-LABEL: extract1_i32_zext_insert1_i64_undef:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: movi v1.2d, #0000000000000000
|
||||
; CHECK-NEXT: zip1 v0.4s, v0.4s, v0.4s
|
||||
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #4
|
||||
; CHECK-NEXT: mov w8, v0.s[1]
|
||||
; CHECK-NEXT: dup v0.2d, x8
|
||||
; CHECK-NEXT: ret
|
||||
%e = extractelement <4 x i32> %x, i32 1
|
||||
%z = zext i32 %e to i64
|
||||
|
|
|
@ -46,9 +46,11 @@ entry:
|
|||
define <8 x i16> @v8i16_2(<4 x i16> %a, <4 x i16> %b) {
|
||||
; CHECK-LABEL: v8i16_2:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: rev64 v2.4h, v0.4h
|
||||
; CHECK-NEXT: rev64 v0.4h, v1.4h
|
||||
; CHECK-NEXT: mov v0.d[1], v2.d[0]
|
||||
; CHECK-NEXT: adrp x8, .LCPI4_0
|
||||
; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
|
||||
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
|
||||
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_0]
|
||||
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%V128 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
|
||||
|
|
|
@ -136,11 +136,11 @@ entry:
|
|||
define <8 x i16> @shuffle_widen_faili1(<4 x i16> %a, <4 x i16> %b) {
|
||||
; CHECK-LABEL: shuffle_widen_faili1:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: rev32 v2.4h, v0.4h
|
||||
; CHECK-NEXT: rev32 v3.4h, v1.4h
|
||||
; CHECK-NEXT: ext v1.8b, v2.8b, v1.8b, #4
|
||||
; CHECK-NEXT: ext v0.8b, v3.8b, v0.8b, #4
|
||||
; CHECK-NEXT: mov v0.d[1], v1.d[0]
|
||||
; CHECK-NEXT: adrp x8, .LCPI12_0
|
||||
; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
|
||||
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
|
||||
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_0]
|
||||
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%res = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 7, i32 6, i32 0, i32 1,
|
||||
|
@ -151,11 +151,11 @@ entry:
|
|||
define <8 x i16> @shuffle_widen_fail2(<4 x i16> %a, <4 x i16> %b) {
|
||||
; CHECK-LABEL: shuffle_widen_fail2:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: uzp1 v2.4h, v0.4h, v0.4h
|
||||
; CHECK-NEXT: trn1 v3.4h, v1.4h, v1.4h
|
||||
; CHECK-NEXT: ext v1.8b, v2.8b, v1.8b, #4
|
||||
; CHECK-NEXT: ext v0.8b, v3.8b, v0.8b, #4
|
||||
; CHECK-NEXT: mov v0.d[1], v1.d[0]
|
||||
; CHECK-NEXT: adrp x8, .LCPI13_0
|
||||
; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
|
||||
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
|
||||
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_0]
|
||||
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%res = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 6, i32 6, i32 0, i32 1,
|
||||
|
|
|
@ -4,22 +4,21 @@
|
|||
define <16 x i32> @test_shuf1(<16 x i32> %x, <16 x i32> %y) {
|
||||
; CHECK-LABEL: test_shuf1:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: zip2 v3.4s, v7.4s, v6.4s
|
||||
; CHECK-NEXT: ext v5.16b, v6.16b, v4.16b, #12
|
||||
; CHECK-NEXT: uzp1 v6.4s, v1.4s, v0.4s
|
||||
; CHECK-NEXT: uzp2 v4.4s, v2.4s, v4.4s
|
||||
; CHECK-NEXT: trn2 v3.4s, v7.4s, v3.4s
|
||||
; CHECK-NEXT: ext v5.16b, v7.16b, v5.16b, #8
|
||||
; CHECK-NEXT: trn2 v6.4s, v6.4s, v1.4s
|
||||
; CHECK-NEXT: trn1 v2.4s, v4.4s, v2.4s
|
||||
; CHECK-NEXT: ext v4.16b, v1.16b, v1.16b, #12
|
||||
; CHECK-NEXT: ext v3.16b, v1.16b, v3.16b, #8
|
||||
; CHECK-NEXT: rev64 v16.4s, v5.4s
|
||||
; CHECK-NEXT: dup v7.4s, v7.s[0]
|
||||
; CHECK-NEXT: ext v1.16b, v0.16b, v6.16b, #12
|
||||
; CHECK-NEXT: mov v2.s[3], v7.s[3]
|
||||
; CHECK-NEXT: ext v0.16b, v3.16b, v4.16b, #8
|
||||
; CHECK-NEXT: ext v3.16b, v5.16b, v16.16b, #8
|
||||
; CHECK-NEXT: uzp1 v16.4s, v1.4s, v0.4s
|
||||
; CHECK-NEXT: ext v3.16b, v6.16b, v4.16b, #12
|
||||
; CHECK-NEXT: zip2 v6.4s, v7.4s, v6.4s
|
||||
; CHECK-NEXT: uzp2 v17.4s, v2.4s, v4.4s
|
||||
; CHECK-NEXT: trn2 v16.4s, v16.4s, v1.4s
|
||||
; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #4
|
||||
; CHECK-NEXT: trn2 v4.4s, v7.4s, v6.4s
|
||||
; CHECK-NEXT: rev64 v5.4s, v7.4s
|
||||
; CHECK-NEXT: trn1 v2.4s, v17.4s, v2.4s
|
||||
; CHECK-NEXT: dup v6.4s, v7.s[0]
|
||||
; CHECK-NEXT: mov v4.d[1], v1.d[1]
|
||||
; CHECK-NEXT: mov v3.d[1], v5.d[1]
|
||||
; CHECK-NEXT: ext v1.16b, v0.16b, v16.16b, #12
|
||||
; CHECK-NEXT: mov v2.s[3], v6.s[3]
|
||||
; CHECK-NEXT: mov v0.16b, v4.16b
|
||||
; CHECK-NEXT: ret
|
||||
%s3 = shufflevector <16 x i32> %x, <16 x i32> %y, <16 x i32> <i32 29, i32 26, i32 7, i32 4, i32 3, i32 6, i32 5, i32 2, i32 9, i32 8, i32 17, i32 28, i32 27, i32 16, i32 31, i32 30>
|
||||
ret <16 x i32> %s3
|
||||
|
@ -29,10 +28,9 @@ define <4 x i32> @test_shuf2(<16 x i32> %x, <16 x i32> %y) {
|
|||
; CHECK-LABEL: test_shuf2:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: zip2 v0.4s, v7.4s, v6.4s
|
||||
; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #12
|
||||
; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #4
|
||||
; CHECK-NEXT: trn2 v0.4s, v7.4s, v0.4s
|
||||
; CHECK-NEXT: ext v0.16b, v1.16b, v0.16b, #8
|
||||
; CHECK-NEXT: ext v0.16b, v0.16b, v2.16b, #8
|
||||
; CHECK-NEXT: mov v0.d[1], v1.d[1]
|
||||
; CHECK-NEXT: ret
|
||||
%s3 = shufflevector <16 x i32> %x, <16 x i32> %y, <4 x i32> <i32 29, i32 26, i32 7, i32 4>
|
||||
ret <4 x i32> %s3
|
||||
|
@ -64,10 +62,9 @@ define <4 x i32> @test_shuf4(<16 x i32> %x, <16 x i32> %y) {
|
|||
define <4 x i32> @test_shuf5(<16 x i32> %x, <16 x i32> %y) {
|
||||
; CHECK-LABEL: test_shuf5:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: rev64 v1.4s, v7.4s
|
||||
; CHECK-NEXT: ext v0.16b, v6.16b, v4.16b, #12
|
||||
; CHECK-NEXT: ext v0.16b, v7.16b, v0.16b, #8
|
||||
; CHECK-NEXT: rev64 v1.4s, v0.4s
|
||||
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #8
|
||||
; CHECK-NEXT: mov v0.d[1], v1.d[1]
|
||||
; CHECK-NEXT: ret
|
||||
%s3 = shufflevector <16 x i32> %x, <16 x i32> %y, <4 x i32> <i32 27, i32 16, i32 31, i32 30>
|
||||
ret <4 x i32> %s3
|
||||
|
|
Loading…
Reference in New Issue