[AArch64] Only mark cost 1 perfect shuffles as legal

The perfect shuffle tables encode a cost of either 0 (a nop-copy) or 1
(a single instruction) with a cost encoding of 0 in the upper 2 bits.
All perfect shuffles with any cost are then marked as legal shuffles
though (the maximum encoded cost is 3), which can confuse the DAG
combiner into thinking the shuffles are cheaper than they should be.

Limiting legal shuffles to single instructions seems to do better in
most cases, producing fewer instructions for complex shuffles. There are
some cases that now become tbl, which may be better or worse depending
on whether the instruction is in a loop and the tbl load can be hoisted
out.

Differential Revision: https://reviews.llvm.org/D123377
This commit is contained in:
David Green 2022-04-19 12:58:55 +01:00
parent 76410040b9
commit cc9495f679
6 changed files with 48 additions and 52 deletions

View File

@ -11489,7 +11489,9 @@ bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
unsigned Cost = (PFEntry >> 30);
if (Cost <= 4)
// The cost tables encode cost 0 or cost 1 shuffles using the value 0 in
// the top 2 bits.
if (Cost == 0)
return true;
}

View File

@ -7,10 +7,9 @@ target triple = "aarch64-unknown-linux-gnu"
define <4 x i16> @f(<4 x i32> %vqdmlal_v3.i, <8 x i16> %x5) {
; CHECK-LABEL: f:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: uzp1 v0.4h, v0.4h, v0.4h
; CHECK-NEXT: ext v1.8b, v0.8b, v1.8b, #4
; CHECK-NEXT: uzp1 v0.4h, v1.4h, v0.4h
; CHECK-NEXT: dup v0.4h, v0.h[4]
; CHECK-NEXT: mov v0.h[1], v1.h[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
entry:
; Check that we don't just dup the input vector. The code emitted is ext, dup, ext, ext

View File

@ -30,9 +30,8 @@ define <2 x i64> @extract0_i32_zext_insert0_i64_zero(<4 x i32> %x) {
define <2 x i64> @extract1_i32_zext_insert0_i64_undef(<4 x i32> %x) {
; CHECK-LABEL: extract1_i32_zext_insert0_i64_undef:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.2d, #0000000000000000
; CHECK-NEXT: zip1 v1.4s, v0.4s, v1.4s
; CHECK-NEXT: trn2 v0.4s, v0.4s, v1.4s
; CHECK-NEXT: mov w8, v0.s[1]
; CHECK-NEXT: fmov d0, x8
; CHECK-NEXT: ret
%e = extractelement <4 x i32> %x, i32 1
%z = zext i32 %e to i64
@ -57,9 +56,8 @@ define <2 x i64> @extract1_i32_zext_insert0_i64_zero(<4 x i32> %x) {
define <2 x i64> @extract2_i32_zext_insert0_i64_undef(<4 x i32> %x) {
; CHECK-LABEL: extract2_i32_zext_insert0_i64_undef:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.2d, #0000000000000000
; CHECK-NEXT: uzp1 v1.4s, v0.4s, v1.4s
; CHECK-NEXT: zip2 v0.4s, v0.4s, v1.4s
; CHECK-NEXT: mov w8, v0.s[2]
; CHECK-NEXT: fmov d0, x8
; CHECK-NEXT: ret
%e = extractelement <4 x i32> %x, i32 2
%z = zext i32 %e to i64
@ -110,9 +108,8 @@ define <2 x i64> @extract3_i32_zext_insert0_i64_zero(<4 x i32> %x) {
define <2 x i64> @extract0_i32_zext_insert1_i64_undef(<4 x i32> %x) {
; CHECK-LABEL: extract0_i32_zext_insert1_i64_undef:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.2d, #0000000000000000
; CHECK-NEXT: zip1 v1.4s, v0.4s, v1.4s
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #8
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: dup v0.2d, x8
; CHECK-NEXT: ret
%e = extractelement <4 x i32> %x, i32 0
%z = zext i32 %e to i64
@ -137,9 +134,8 @@ define <2 x i64> @extract0_i32_zext_insert1_i64_zero(<4 x i32> %x) {
define <2 x i64> @extract1_i32_zext_insert1_i64_undef(<4 x i32> %x) {
; CHECK-LABEL: extract1_i32_zext_insert1_i64_undef:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.2d, #0000000000000000
; CHECK-NEXT: zip1 v0.4s, v0.4s, v0.4s
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #4
; CHECK-NEXT: mov w8, v0.s[1]
; CHECK-NEXT: dup v0.2d, x8
; CHECK-NEXT: ret
%e = extractelement <4 x i32> %x, i32 1
%z = zext i32 %e to i64

View File

@ -46,9 +46,11 @@ entry:
define <8 x i16> @v8i16_2(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: v8i16_2:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: rev64 v2.4h, v0.4h
; CHECK-NEXT: rev64 v0.4h, v1.4h
; CHECK-NEXT: mov v0.d[1], v2.d[0]
; CHECK-NEXT: adrp x8, .LCPI4_0
; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_0]
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-NEXT: ret
entry:
%V128 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>

View File

@ -136,11 +136,11 @@ entry:
define <8 x i16> @shuffle_widen_faili1(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: shuffle_widen_faili1:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: rev32 v2.4h, v0.4h
; CHECK-NEXT: rev32 v3.4h, v1.4h
; CHECK-NEXT: ext v1.8b, v2.8b, v1.8b, #4
; CHECK-NEXT: ext v0.8b, v3.8b, v0.8b, #4
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: adrp x8, .LCPI12_0
; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_0]
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-NEXT: ret
entry:
%res = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 7, i32 6, i32 0, i32 1,
@ -151,11 +151,11 @@ entry:
define <8 x i16> @shuffle_widen_fail2(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: shuffle_widen_fail2:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: uzp1 v2.4h, v0.4h, v0.4h
; CHECK-NEXT: trn1 v3.4h, v1.4h, v1.4h
; CHECK-NEXT: ext v1.8b, v2.8b, v1.8b, #4
; CHECK-NEXT: ext v0.8b, v3.8b, v0.8b, #4
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: adrp x8, .LCPI13_0
; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_0]
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-NEXT: ret
entry:
%res = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 6, i32 6, i32 0, i32 1,

View File

@ -4,22 +4,21 @@
define <16 x i32> @test_shuf1(<16 x i32> %x, <16 x i32> %y) {
; CHECK-LABEL: test_shuf1:
; CHECK: // %bb.0:
; CHECK-NEXT: zip2 v3.4s, v7.4s, v6.4s
; CHECK-NEXT: ext v5.16b, v6.16b, v4.16b, #12
; CHECK-NEXT: uzp1 v6.4s, v1.4s, v0.4s
; CHECK-NEXT: uzp2 v4.4s, v2.4s, v4.4s
; CHECK-NEXT: trn2 v3.4s, v7.4s, v3.4s
; CHECK-NEXT: ext v5.16b, v7.16b, v5.16b, #8
; CHECK-NEXT: trn2 v6.4s, v6.4s, v1.4s
; CHECK-NEXT: trn1 v2.4s, v4.4s, v2.4s
; CHECK-NEXT: ext v4.16b, v1.16b, v1.16b, #12
; CHECK-NEXT: ext v3.16b, v1.16b, v3.16b, #8
; CHECK-NEXT: rev64 v16.4s, v5.4s
; CHECK-NEXT: dup v7.4s, v7.s[0]
; CHECK-NEXT: ext v1.16b, v0.16b, v6.16b, #12
; CHECK-NEXT: mov v2.s[3], v7.s[3]
; CHECK-NEXT: ext v0.16b, v3.16b, v4.16b, #8
; CHECK-NEXT: ext v3.16b, v5.16b, v16.16b, #8
; CHECK-NEXT: uzp1 v16.4s, v1.4s, v0.4s
; CHECK-NEXT: ext v3.16b, v6.16b, v4.16b, #12
; CHECK-NEXT: zip2 v6.4s, v7.4s, v6.4s
; CHECK-NEXT: uzp2 v17.4s, v2.4s, v4.4s
; CHECK-NEXT: trn2 v16.4s, v16.4s, v1.4s
; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #4
; CHECK-NEXT: trn2 v4.4s, v7.4s, v6.4s
; CHECK-NEXT: rev64 v5.4s, v7.4s
; CHECK-NEXT: trn1 v2.4s, v17.4s, v2.4s
; CHECK-NEXT: dup v6.4s, v7.s[0]
; CHECK-NEXT: mov v4.d[1], v1.d[1]
; CHECK-NEXT: mov v3.d[1], v5.d[1]
; CHECK-NEXT: ext v1.16b, v0.16b, v16.16b, #12
; CHECK-NEXT: mov v2.s[3], v6.s[3]
; CHECK-NEXT: mov v0.16b, v4.16b
; CHECK-NEXT: ret
%s3 = shufflevector <16 x i32> %x, <16 x i32> %y, <16 x i32> <i32 29, i32 26, i32 7, i32 4, i32 3, i32 6, i32 5, i32 2, i32 9, i32 8, i32 17, i32 28, i32 27, i32 16, i32 31, i32 30>
ret <16 x i32> %s3
@ -29,10 +28,9 @@ define <4 x i32> @test_shuf2(<16 x i32> %x, <16 x i32> %y) {
; CHECK-LABEL: test_shuf2:
; CHECK: // %bb.0:
; CHECK-NEXT: zip2 v0.4s, v7.4s, v6.4s
; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #12
; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #4
; CHECK-NEXT: trn2 v0.4s, v7.4s, v0.4s
; CHECK-NEXT: ext v0.16b, v1.16b, v0.16b, #8
; CHECK-NEXT: ext v0.16b, v0.16b, v2.16b, #8
; CHECK-NEXT: mov v0.d[1], v1.d[1]
; CHECK-NEXT: ret
%s3 = shufflevector <16 x i32> %x, <16 x i32> %y, <4 x i32> <i32 29, i32 26, i32 7, i32 4>
ret <4 x i32> %s3
@ -64,10 +62,9 @@ define <4 x i32> @test_shuf4(<16 x i32> %x, <16 x i32> %y) {
define <4 x i32> @test_shuf5(<16 x i32> %x, <16 x i32> %y) {
; CHECK-LABEL: test_shuf5:
; CHECK: // %bb.0:
; CHECK-NEXT: rev64 v1.4s, v7.4s
; CHECK-NEXT: ext v0.16b, v6.16b, v4.16b, #12
; CHECK-NEXT: ext v0.16b, v7.16b, v0.16b, #8
; CHECK-NEXT: rev64 v1.4s, v0.4s
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #8
; CHECK-NEXT: mov v0.d[1], v1.d[1]
; CHECK-NEXT: ret
%s3 = shufflevector <16 x i32> %x, <16 x i32> %y, <4 x i32> <i32 27, i32 16, i32 31, i32 30>
ret <4 x i32> %s3