[SVE][CodeGen] Restructure SVE fixed length tests to use update_llc_test_checks.

Most tests have been updated to make use of vscale_range to reduce
the number of RUN lines. For the remaining RUN lines, the check
prefixes have been updated to ensure the expectations of the
original manual CHECK lines are maintained after
update_llc_test_checks is run.
Paul Walker 2022-06-13 17:06:22 +01:00
parent af6ec9200b
commit fcd058acc9
47 changed files with 20179 additions and 27069 deletions
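As context for the diffs below, here is a minimal sketch of the pattern being applied. It is illustrative only: the file name, function name, and attribute group body are assumptions, not taken from the commit. A vscale_range(2,0) attribute tells the backend that vscale is at least 2 and unbounded above, i.e. that SVE registers are at least 256 bits wide, so such a function compiles the same way under every remaining RUN line and needs only a single set of CHECK lines; most of the old per-vector-width RUN lines can then be dropped:

; sve-fixed-length-example.ll (hypothetical file, sketch only)
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"

; Codegen is identical under all three RUN lines, so only plain CHECK
; lines are generated for this function.
define void @copy_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
  %load = load volatile <16 x i16>, <16 x i16>* %a
  store volatile <16 x i16> %load, <16 x i16>* %b
  ret void
}

attributes #0 = { "target-features"="+sve" }

The assertions are then regenerated mechanically with llvm/utils/update_llc_test_checks.py, which inserts the "; NOTE: Assertions have been autogenerated ..." header seen at the top of each updated file. Functions whose codegen still differs by vector width (e.g. the bitcast_v32i16 case below) keep distinct VBITS_GE_* check prefixes instead of a vscale_range attribute.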

File diff suppressed because it is too large.


@@ -1,31 +1,17 @@
; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: ptrue
; Don't use SVE for 64-bit vectors.
define void @bitcast_v4i16(<4 x i16> *%a, <4 x half>* %b) #0 {
define void @bitcast_v4i16(<4 x i16> *%a, <4 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: bitcast_v4i16:
; CHECK: ldr d0, [x0]
; CHECK-NEXT: str d0, [x1]
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: str d0, [x1]
; CHECK-NEXT: ret
%load = load volatile <4 x i16>, <4 x i16>* %a
%cast = bitcast <4 x i16> %load to <4 x half>
store volatile <4 x half> %cast, <4 x half>* %b
@@ -33,23 +19,25 @@ define void @bitcast_v4i16(<4 x i16> *%a, <4 x half>* %b) #0 {
}
; Don't use SVE for 128-bit vectors.
define void @bitcast_v8i16(<8 x i16> *%a, <8 x half>* %b) #0 {
define void @bitcast_v8i16(<8 x i16> *%a, <8 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: bitcast_v8i16:
; CHECK: ldr q0, [x0]
; CHECK-NEXT: str q0, [x1]
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: str q0, [x1]
; CHECK-NEXT: ret
%load = load volatile <8 x i16>, <8 x i16>* %a
%cast = bitcast <8 x i16> %load to <8 x half>
store volatile <8 x half> %cast, <8 x half>* %b
ret void
}
define void @bitcast_v16i16(<16 x i16> *%a, <16 x half>* %b) #0 {
define void @bitcast_v16i16(<16 x i16> *%a, <16 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: bitcast_v16i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: st1h { [[OP]].h }, [[PG]], [x1]
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: st1h { z0.h }, p0, [x1]
; CHECK-NEXT: ret
%load = load volatile <16 x i16>, <16 x i16>* %a
%cast = bitcast <16 x i16> %load to <16 x half>
store volatile <16 x half> %cast, <16 x half>* %b
@@ -57,35 +45,48 @@ define void @bitcast_v16i16(<16 x i16> *%a, <16 x half>* %b) #0 {
}
define void @bitcast_v32i16(<32 x i16> *%a, <32 x half>* %b) #0 {
; CHECK-LABEL: bitcast_v32i16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: st1h { [[OP]].h }, [[PG]], [x1]
; VBITS_GE_512-NEXT: ret
; VBITS_GE_256-LABEL: bitcast_v32i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: bitcast_v32i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_512-NEXT: ret
%load = load volatile <32 x i16>, <32 x i16>* %a
%cast = bitcast <32 x i16> %load to <32 x half>
store volatile <32 x half> %cast, <32 x half>* %b
ret void
}
define void @bitcast_v64i16(<64 x i16> *%a, <64 x half>* %b) #0 {
define void @bitcast_v64i16(<64 x i16> *%a, <64 x half>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: bitcast_v64i16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: st1h { [[OP]].h }, [[PG]], [x1]
; VBITS_GE_1024-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: st1h { z0.h }, p0, [x1]
; CHECK-NEXT: ret
%load = load volatile <64 x i16>, <64 x i16>* %a
%cast = bitcast <64 x i16> %load to <64 x half>
store volatile <64 x half> %cast, <64 x half>* %b
ret void
}
define void @bitcast_v128i16(<128 x i16> *%a, <128 x half>* %b) #0 {
define void @bitcast_v128i16(<128 x i16> *%a, <128 x half>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: bitcast_v128i16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: st1h { [[OP]].h }, [[PG]], [x1]
; VBITS_GE_2048-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: st1h { z0.h }, p0, [x1]
; CHECK-NEXT: ret
%load = load volatile <128 x i16>, <128 x i16>* %a
%cast = bitcast <128 x i16> %load to <128 x half>
store volatile <128 x half> %cast, <128 x half>* %b
@@ -93,11 +94,12 @@ define void @bitcast_v128i16(<128 x i16> *%a, <128 x half>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
define void @bitcast_v2i32(<2 x i32> *%a, <2 x float>* %b) #0 {
define void @bitcast_v2i32(<2 x i32> *%a, <2 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: bitcast_v2i32:
; CHECK: ldr d0, [x0]
; CHECK-NEXT: str d0, [x1]
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: str d0, [x1]
; CHECK-NEXT: ret
%load = load volatile <2 x i32>, <2 x i32>* %a
%cast = bitcast <2 x i32> %load to <2 x float>
store volatile <2 x float> %cast, <2 x float>* %b
@@ -105,23 +107,25 @@ define void @bitcast_v2i32(<2 x i32> *%a, <2 x float>* %b) #0 {
}
; Don't use SVE for 128-bit vectors.
define void @bitcast_v4i32(<4 x i32> *%a, <4 x float>* %b) #0 {
define void @bitcast_v4i32(<4 x i32> *%a, <4 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: bitcast_v4i32:
; CHECK: ldr q0, [x0]
; CHECK-NEXT: str q0, [x1]
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: str q0, [x1]
; CHECK-NEXT: ret
%load = load volatile <4 x i32>, <4 x i32>* %a
%cast = bitcast <4 x i32> %load to <4 x float>
store volatile <4 x float> %cast, <4 x float>* %b
ret void
}
define void @bitcast_v8i32(<8 x i32> *%a, <8 x float>* %b) #0 {
define void @bitcast_v8i32(<8 x i32> *%a, <8 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: bitcast_v8i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: st1w { [[OP]].s }, [[PG]], [x1]
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret
%load = load volatile <8 x i32>, <8 x i32>* %a
%cast = bitcast <8 x i32> %load to <8 x float>
store volatile <8 x float> %cast, <8 x float>* %b
@@ -129,35 +133,48 @@ define void @bitcast_v8i32(<8 x i32> *%a, <8 x float>* %b) #0 {
}
define void @bitcast_v16i32(<16 x i32> *%a, <16 x float>* %b) #0 {
; CHECK-LABEL: bitcast_v16i32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: st1w { [[OP]].s }, [[PG]], [x1]
; VBITS_GE_512-NEXT: ret
; VBITS_GE_256-LABEL: bitcast_v16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: bitcast_v16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT: ret
%load = load volatile <16 x i32>, <16 x i32>* %a
%cast = bitcast <16 x i32> %load to <16 x float>
store volatile <16 x float> %cast, <16 x float>* %b
ret void
}
define void @bitcast_v32i32(<32 x i32> *%a, <32 x float>* %b) #0 {
define void @bitcast_v32i32(<32 x i32> *%a, <32 x float>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: bitcast_v32i32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: st1w { [[OP]].s }, [[PG]], [x1]
; VBITS_GE_1024-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret
%load = load volatile <32 x i32>, <32 x i32>* %a
%cast = bitcast <32 x i32> %load to <32 x float>
store volatile <32 x float> %cast, <32 x float>* %b
ret void
}
define void @bitcast_v64i32(<64 x i32> *%a, <64 x float>* %b) #0 {
define void @bitcast_v64i32(<64 x i32> *%a, <64 x float>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: bitcast_v64i32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: st1w { [[OP]].s }, [[PG]], [x1]
; VBITS_GE_2048-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret
%load = load volatile <64 x i32>, <64 x i32>* %a
%cast = bitcast <64 x i32> %load to <64 x float>
store volatile <64 x float> %cast, <64 x float>* %b
@@ -165,11 +182,12 @@ define void @bitcast_v64i32(<64 x i32> *%a, <64 x float>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
define void @bitcast_v1i64(<1 x i64> *%a, <1 x double>* %b) #0 {
define void @bitcast_v1i64(<1 x i64> *%a, <1 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: bitcast_v1i64:
; CHECK: ldr d0, [x0]
; CHECK-NEXT: str d0, [x1]
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: str d0, [x1]
; CHECK-NEXT: ret
%load = load volatile <1 x i64>, <1 x i64>* %a
%cast = bitcast <1 x i64> %load to <1 x double>
store volatile <1 x double> %cast, <1 x double>* %b
@@ -177,23 +195,25 @@ define void @bitcast_v1i64(<1 x i64> *%a, <1 x double>* %b) #0 {
}
; Don't use SVE for 128-bit vectors.
define void @bitcast_v2i64(<2 x i64> *%a, <2 x double>* %b) #0 {
define void @bitcast_v2i64(<2 x i64> *%a, <2 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: bitcast_v2i64:
; CHECK: ldr q0, [x0]
; CHECK-NEXT: str q0, [x1]
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: str q0, [x1]
; CHECK-NEXT: ret
%load = load volatile <2 x i64>, <2 x i64>* %a
%cast = bitcast <2 x i64> %load to <2 x double>
store volatile <2 x double> %cast, <2 x double>* %b
ret void
}
define void @bitcast_v4i64(<4 x i64> *%a, <4 x double>* %b) #0 {
define void @bitcast_v4i64(<4 x i64> *%a, <4 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: bitcast_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: st1d { [[OP]].d }, [[PG]], [x1]
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; CHECK-NEXT: ret
%load = load volatile <4 x i64>, <4 x i64>* %a
%cast = bitcast <4 x i64> %load to <4 x double>
store volatile <4 x double> %cast, <4 x double>* %b
@@ -201,35 +221,48 @@ define void @bitcast_v4i64(<4 x i64> *%a, <4 x double>* %b) #0 {
}
define void @bitcast_v8i64(<8 x i64> *%a, <8 x double>* %b) #0 {
; CHECK-LABEL: bitcast_v8i64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: st1d { [[OP]].d }, [[PG]], [x1]
; VBITS_GE_512-NEXT: ret
; VBITS_GE_256-LABEL: bitcast_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: bitcast_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT: ret
%load = load volatile <8 x i64>, <8 x i64>* %a
%cast = bitcast <8 x i64> %load to <8 x double>
store volatile <8 x double> %cast, <8 x double>* %b
ret void
}
define void @bitcast_v16i64(<16 x i64> *%a, <16 x double>* %b) #0 {
define void @bitcast_v16i64(<16 x i64> *%a, <16 x double>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: bitcast_v16i64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: st1d { [[OP]].d }, [[PG]], [x1]
; VBITS_GE_1024-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; CHECK-NEXT: ret
%load = load volatile <16 x i64>, <16 x i64>* %a
%cast = bitcast <16 x i64> %load to <16 x double>
store volatile <16 x double> %cast, <16 x double>* %b
ret void
}
define void @bitcast_v32i64(<32 x i64> *%a, <32 x double>* %b) #0 {
define void @bitcast_v32i64(<32 x i64> *%a, <32 x double>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: bitcast_v32i64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: st1d { [[OP]].d }, [[PG]], [x1]
; VBITS_GE_2048-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; CHECK-NEXT: ret
%load = load volatile <32 x i64>, <32 x i64>* %a
%cast = bitcast <32 x i64> %load to <32 x double>
store volatile <32 x double> %cast, <32 x double>* %b

File diff suppressed because it is too large.


@@ -1,25 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; Don't use SVE when its registers are no bigger than NEON.
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_2048
target triple = "aarch64-unknown-linux-gnu"
define <4 x i32> @load_zext_v4i16i32(<4 x i16>* %ap) #0 {
define <4 x i32> @load_zext_v4i16i32(<4 x i16>* %ap) vscale_range(2,0) #0 {
; CHECK-LABEL: load_zext_v4i16i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
@@ -49,7 +34,7 @@ define <2 x i256> @load_zext_v2i64i256(<2 x i64>* %ap) #0 {
ret <2 x i256> %val
}
define <8 x i32> @load_zext_v8i16i32(<8 x i16>* %ap) #0 {
define <8 x i32> @load_zext_v8i16i32(<8 x i16>* %ap) vscale_range(2,0) #0 {
; CHECK-LABEL: load_zext_v8i16i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -61,103 +46,43 @@ define <8 x i32> @load_zext_v8i16i32(<8 x i16>* %ap) #0 {
ret <8 x i32> %val
}
define <16 x i32> @load_zext_v16i16i32(<16 x i16>* %ap) #0 {
; VBITS_GE_256-LABEL: load_zext_v16i16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: mov x9, #8
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: load_zext_v16i16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1h { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_512-NEXT: ret
; Ensure sensible type legalisation
define <16 x i32> @load_zext_v16i16i32(<16 x i16>* %ap) vscale_range(4,0) #0 {
; CHECK-LABEL: load_zext_v16i16i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl16
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
; CHECK-NEXT: st1w { z0.s }, p0, [x8]
; CHECK-NEXT: ret
%a = load <16 x i16>, <16 x i16>* %ap
%val = zext <16 x i16> %a to <16 x i32>
ret <16 x i32> %val
}
define <32 x i32> @load_zext_v32i16i32(<32 x i16>* %ap) #0 {
; VBITS_GE_256-LABEL: load_zext_v32i16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x9, #16
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: mov x10, #24
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: uunpklo z2.s, z0.h
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_256-NEXT: mov x9, #8
; VBITS_GE_256-NEXT: uunpklo z2.s, z1.h
; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: load_zext_v32i16i32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1h { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_1024-NEXT: ret
define <32 x i32> @load_zext_v32i16i32(<32 x i16>* %ap) vscale_range(8,0) #0 {
; CHECK-LABEL: load_zext_v32i16i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
; CHECK-NEXT: st1w { z0.s }, p0, [x8]
; CHECK-NEXT: ret
%a = load <32 x i16>, <32 x i16>* %ap
%val = zext <32 x i16> %a to <32 x i32>
ret <32 x i32> %val
}
define <64 x i32> @load_zext_v64i16i32(<64 x i16>* %ap) #0 {
; VBITS_GE_256-LABEL: load_zext_v64i16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x9, #16
; VBITS_GE_256-NEXT: mov x10, #32
; VBITS_GE_256-NEXT: mov x11, #48
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: mov x12, #24
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x11, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: uunpklo z4.s, z0.h
; VBITS_GE_256-NEXT: uunpklo z5.s, z1.h
; VBITS_GE_256-NEXT: uunpklo z6.s, z2.h
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x8, x11, lsl #2]
; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x8, x10, lsl #2]
; VBITS_GE_256-NEXT: mov x10, #56
; VBITS_GE_256-NEXT: mov x11, #40
; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_256-NEXT: mov x9, #8
; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
; VBITS_GE_256-NEXT: uunpklo z7.s, z3.h
; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: uunpklo z2.s, z2.h
; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x8, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x11, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x12, lsl #2]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x8]
; VBITS_GE_256-NEXT: ret
; VBITS_GE_1024-LABEL: load_zext_v64i16i32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
; VBITS_GE_1024-NEXT: mov x9, #32
; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: uunpklo z1.s, z0.h
; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_1024-NEXT: st1w { z1.s }, p0, [x8]
; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: load_zext_v64i16i32:
; VBITS_GE_2048: // %bb.0:
@@ -170,7 +95,7 @@ define <64 x i32> @load_zext_v64i16i32(<64 x i16>* %ap) #0 {
ret <64 x i32> %val
}
define <4 x i32> @load_sext_v4i16i32(<4 x i16>* %ap) #0 {
define <4 x i32> @load_sext_v4i16i32(<4 x i16>* %ap) vscale_range(2,0) #0 {
; CHECK-LABEL: load_sext_v4i16i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
@@ -181,7 +106,7 @@ define <4 x i32> @load_sext_v4i16i32(<4 x i16>* %ap) #0 {
ret <4 x i32> %val
}
define <8 x i32> @load_sext_v8i16i32(<8 x i16>* %ap) #0 {
define <8 x i32> @load_sext_v8i16i32(<8 x i16>* %ap) vscale_range(2,0) #0 {
; CHECK-LABEL: load_sext_v8i16i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -193,103 +118,43 @@ define <8 x i32> @load_sext_v8i16i32(<8 x i16>* %ap) #0 {
ret <8 x i32> %val
}
define <16 x i32> @load_sext_v16i16i32(<16 x i16>* %ap) #0 {
; VBITS_GE_256-LABEL: load_sext_v16i16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: mov x9, #8
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: sunpklo z1.s, z0.h
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: load_sext_v16i16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1sh { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_512-NEXT: ret
; Ensure sensible type legalisation
define <16 x i32> @load_sext_v16i16i32(<16 x i16>* %ap) vscale_range(4,0) #0 {
; CHECK-LABEL: load_sext_v16i16i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl16
; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
; CHECK-NEXT: st1w { z0.s }, p0, [x8]
; CHECK-NEXT: ret
%a = load <16 x i16>, <16 x i16>* %ap
%val = sext <16 x i16> %a to <16 x i32>
ret <16 x i32> %val
}
define <32 x i32> @load_sext_v32i16i32(<32 x i16>* %ap) #0 {
; VBITS_GE_256-LABEL: load_sext_v32i16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x9, #16
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: mov x10, #24
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: sunpklo z2.s, z0.h
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_256-NEXT: mov x9, #8
; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h
; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: load_sext_v32i16i32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1sh { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_1024-NEXT: ret
define <32 x i32> @load_sext_v32i16i32(<32 x i16>* %ap) vscale_range(8,0) #0 {
; CHECK-LABEL: load_sext_v32i16i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
; CHECK-NEXT: st1w { z0.s }, p0, [x8]
; CHECK-NEXT: ret
%a = load <32 x i16>, <32 x i16>* %ap
%val = sext <32 x i16> %a to <32 x i32>
ret <32 x i32> %val
}
define <64 x i32> @load_sext_v64i16i32(<64 x i16>* %ap) #0 {
; VBITS_GE_256-LABEL: load_sext_v64i16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x9, #16
; VBITS_GE_256-NEXT: mov x10, #32
; VBITS_GE_256-NEXT: mov x11, #48
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: mov x12, #24
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x11, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: sunpklo z4.s, z0.h
; VBITS_GE_256-NEXT: sunpklo z5.s, z1.h
; VBITS_GE_256-NEXT: sunpklo z6.s, z2.h
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x8, x11, lsl #2]
; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x8, x10, lsl #2]
; VBITS_GE_256-NEXT: mov x10, #56
; VBITS_GE_256-NEXT: mov x11, #40
; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_256-NEXT: mov x9, #8
; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
; VBITS_GE_256-NEXT: sunpklo z7.s, z3.h
; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h
; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x8, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x11, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x12, lsl #2]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x8]
; VBITS_GE_256-NEXT: ret
; VBITS_GE_1024-LABEL: load_sext_v64i16i32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
; VBITS_GE_1024-NEXT: mov x9, #32
; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: sunpklo z1.s, z0.h
; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
; VBITS_GE_1024-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_1024-NEXT: st1w { z1.s }, p0, [x8]
; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: load_sext_v64i16i32:
; VBITS_GE_2048: // %bb.0:
@@ -303,52 +168,22 @@ define <64 x i32> @load_sext_v64i16i32(<64 x i16>* %ap) #0 {
}
define <32 x i64> @load_zext_v32i8i64(<32 x i8>* %ap) #0 {
; VBITS_GE_256-LABEL: load_zext_v32i8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: mov x9, #8
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x10, #24
; VBITS_GE_256-NEXT: ushll2 v2.8h, v0.16b, #0
; VBITS_GE_256-NEXT: ushll v1.8h, v0.8b, #0
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: ext v3.16b, v2.16b, v2.16b, #8
; VBITS_GE_256-NEXT: uunpklo z2.s, z2.h
; VBITS_GE_256-NEXT: ushll2 v4.8h, v0.16b, #0
; VBITS_GE_256-NEXT: uunpklo z2.d, z2.s
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: mov x9, #12
; VBITS_GE_256-NEXT: uunpklo z2.s, z4.h
; VBITS_GE_256-NEXT: ext v4.16b, v4.16b, v4.16b, #8
; VBITS_GE_256-NEXT: uunpklo z2.d, z2.s
; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x10, lsl #3]
; VBITS_GE_256-NEXT: mov x10, #28
; VBITS_GE_256-NEXT: uunpklo z2.d, z3.s
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: mov x9, #16
; VBITS_GE_256-NEXT: ext v2.16b, v1.16b, v1.16b, #8
; VBITS_GE_256-NEXT: uunpklo z3.s, z4.h
; VBITS_GE_256-NEXT: ushll v0.8h, v0.8b, #0
; VBITS_GE_256-NEXT: uunpklo z3.d, z3.s
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x8, x10, lsl #3]
; VBITS_GE_256-NEXT: mov x10, #4
; VBITS_GE_256-NEXT: ext v3.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: uunpklo z2.s, z2.h
; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: uunpklo z0.d, z2.s
; VBITS_GE_256-NEXT: mov x9, #20
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x10, lsl #3]
; VBITS_GE_256-NEXT: uunpklo z0.s, z1.h
; VBITS_GE_256-NEXT: uunpklo z1.s, z3.h
; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_256-NEXT: ret
; VBITS_GE_1024-LABEL: load_zext_v32i8i64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.b, vl32
; VBITS_GE_1024-NEXT: mov x9, #16
; VBITS_GE_1024-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: uunpklo z1.h, z0.b
; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_1024-NEXT: uunpklo z0.h, z0.b
; VBITS_GE_1024-NEXT: uunpklo z1.s, z1.h
; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_1024-NEXT: uunpklo z1.d, z1.s
; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8]
; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: load_zext_v32i8i64:
; VBITS_GE_2048: // %bb.0:
@@ -362,52 +197,22 @@ define <32 x i64> @load_zext_v32i8i64(<32 x i8>* %ap) #0 {
}
define <32 x i64> @load_sext_v32i8i64(<32 x i8>* %ap) #0 {
; VBITS_GE_256-LABEL: load_sext_v32i8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: mov x9, #8
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x10, #24
; VBITS_GE_256-NEXT: sshll2 v2.8h, v0.16b, #0
; VBITS_GE_256-NEXT: sshll v1.8h, v0.8b, #0
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: ext v3.16b, v2.16b, v2.16b, #8
; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h
; VBITS_GE_256-NEXT: sshll2 v4.8h, v0.16b, #0
; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: mov x9, #12
; VBITS_GE_256-NEXT: sunpklo z2.s, z4.h
; VBITS_GE_256-NEXT: ext v4.16b, v4.16b, v4.16b, #8
; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s
; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x10, lsl #3]
; VBITS_GE_256-NEXT: mov x10, #28
; VBITS_GE_256-NEXT: sunpklo z2.d, z3.s
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: mov x9, #16
; VBITS_GE_256-NEXT: ext v2.16b, v1.16b, v1.16b, #8
; VBITS_GE_256-NEXT: sunpklo z3.s, z4.h
; VBITS_GE_256-NEXT: sshll v0.8h, v0.8b, #0
; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x8, x10, lsl #3]
; VBITS_GE_256-NEXT: mov x10, #4
; VBITS_GE_256-NEXT: ext v3.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h
; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: sunpklo z0.d, z2.s
; VBITS_GE_256-NEXT: mov x9, #20
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x10, lsl #3]
; VBITS_GE_256-NEXT: sunpklo z0.s, z1.h
; VBITS_GE_256-NEXT: sunpklo z1.s, z3.h
; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_256-NEXT: ret
; VBITS_GE_1024-LABEL: load_sext_v32i8i64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.b, vl32
; VBITS_GE_1024-NEXT: mov x9, #16
; VBITS_GE_1024-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: sunpklo z1.h, z0.b
; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_1024-NEXT: sunpklo z0.h, z0.b
; VBITS_GE_1024-NEXT: sunpklo z1.s, z1.h
; VBITS_GE_1024-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_1024-NEXT: sunpklo z1.d, z1.s
; VBITS_GE_1024-NEXT: sunpklo z0.d, z0.s
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8]
; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: load_sext_v32i8i64:
; VBITS_GE_2048: // %bb.0:
@@ -421,50 +226,20 @@ define <32 x i64> @load_sext_v32i8i64(<32 x i8>* %ap) #0 {
}
define <32 x i64> @load_zext_v32i16i64(<32 x i16>* %ap) #0 {
; VBITS_GE_256-LABEL: load_zext_v32i16i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x9, #16
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: mov x10, #24
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ext v2.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h
; VBITS_GE_256-NEXT: uunpklo z3.d, z3.s
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: ext v5.16b, v1.16b, v1.16b, #8
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: mov x9, #20
; VBITS_GE_256-NEXT: uunpklo z4.s, z1.h
; VBITS_GE_256-NEXT: ext v3.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT: uunpklo z2.s, z2.h
; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
; VBITS_GE_256-NEXT: uunpklo z2.d, z2.s
; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: ext v6.16b, v1.16b, v1.16b, #8
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: mov x9, #8
; VBITS_GE_256-NEXT: mov x10, #4
; VBITS_GE_256-NEXT: uunpklo z5.s, z5.h
; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: uunpklo z5.d, z5.s
; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
; VBITS_GE_256-NEXT: uunpklo z0.s, z3.h
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: mov x9, #28
; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x8, x10, lsl #3]
; VBITS_GE_256-NEXT: mov x10, #12
; VBITS_GE_256-NEXT: uunpklo z2.s, z6.h
; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: uunpklo z1.d, z4.s
; VBITS_GE_256-NEXT: uunpklo z2.d, z2.s
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
; VBITS_GE_256-NEXT: ret
; VBITS_GE_1024-LABEL: load_zext_v32i16i64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.h, vl32
; VBITS_GE_1024-NEXT: mov x9, #16
; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: uunpklo z1.s, z0.h
; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #32
; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_1024-NEXT: uunpklo z1.d, z1.s
; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8]
; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: load_zext_v32i16i64:
; VBITS_GE_2048: // %bb.0:
@@ -478,50 +253,20 @@ define <32 x i64> @load_zext_v32i16i64(<32 x i16>* %ap) #0 {
}
define <32 x i64> @load_sext_v32i16i64(<32 x i16>* %ap) #0 {
; VBITS_GE_256-LABEL: load_sext_v32i16i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x9, #16
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: mov x10, #24
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ext v2.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT: sunpklo z3.s, z0.h
; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: ext v5.16b, v1.16b, v1.16b, #8
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: mov x9, #20
; VBITS_GE_256-NEXT: sunpklo z4.s, z1.h
; VBITS_GE_256-NEXT: ext v3.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h
; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s
; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: ext v6.16b, v1.16b, v1.16b, #8
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: mov x9, #8
; VBITS_GE_256-NEXT: mov x10, #4
; VBITS_GE_256-NEXT: sunpklo z5.s, z5.h
; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: sunpklo z5.d, z5.s
; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT: sunpklo z0.s, z3.h
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: mov x9, #28
; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x8, x10, lsl #3]
; VBITS_GE_256-NEXT: mov x10, #12
; VBITS_GE_256-NEXT: sunpklo z2.s, z6.h
; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: sunpklo z1.d, z4.s
; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
; VBITS_GE_256-NEXT: ret
; VBITS_GE_1024-LABEL: load_sext_v32i16i64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.h, vl32
; VBITS_GE_1024-NEXT: mov x9, #16
; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: sunpklo z1.s, z0.h
; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #32
; VBITS_GE_1024-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_1024-NEXT: sunpklo z1.d, z1.s
; VBITS_GE_1024-NEXT: sunpklo z0.d, z0.s
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8]
; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: load_sext_v32i16i64:
; VBITS_GE_2048: // %bb.0:
@@ -535,42 +280,18 @@ define <32 x i64> @load_sext_v32i16i64(<32 x i16>* %ap) #0 {
}
define <32 x i64> @load_zext_v32i32i64(<32 x i32>* %ap) #0 {
; VBITS_GE_256-LABEL: load_zext_v32i32i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x9, #8
; VBITS_GE_256-NEXT: mov x10, #16
; VBITS_GE_256-NEXT: mov x11, #24
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x12, #12
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: uunpklo z4.d, z0.s
; VBITS_GE_256-NEXT: uunpklo z5.d, z1.s
; VBITS_GE_256-NEXT: uunpklo z6.d, z2.s
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x8, x11, lsl #3]
; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x8, x10, lsl #3]
; VBITS_GE_256-NEXT: mov x10, #28
; VBITS_GE_256-NEXT: mov x11, #20
; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: mov x9, #4
; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
; VBITS_GE_256-NEXT: uunpklo z7.d, z3.s
; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
; VBITS_GE_256-NEXT: uunpklo z2.d, z2.s
; VBITS_GE_256-NEXT: uunpklo z3.d, z3.s
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x11, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x12, lsl #3]
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x8]
; VBITS_GE_256-NEXT: ret
; VBITS_GE_1024-LABEL: load_zext_v32i32i64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: mov x9, #16
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: uunpklo z1.d, z0.s
; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8]
; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: load_zext_v32i32i64:
; VBITS_GE_2048: // %bb.0:
@@ -584,42 +305,18 @@ define <32 x i64> @load_zext_v32i32i64(<32 x i32>* %ap) #0 {
}
define <32 x i64> @load_sext_v32i32i64(<32 x i32>* %ap) #0 {
; VBITS_GE_256-LABEL: load_sext_v32i32i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x9, #8
; VBITS_GE_256-NEXT: mov x10, #16
; VBITS_GE_256-NEXT: mov x11, #24
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x12, #12
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: sunpklo z4.d, z0.s
; VBITS_GE_256-NEXT: sunpklo z5.d, z1.s
; VBITS_GE_256-NEXT: sunpklo z6.d, z2.s
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x8, x11, lsl #3]
; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x8, x10, lsl #3]
; VBITS_GE_256-NEXT: mov x10, #28
; VBITS_GE_256-NEXT: mov x11, #20
; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: mov x9, #4
; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
; VBITS_GE_256-NEXT: sunpklo z7.d, z3.s
; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s
; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x11, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x12, lsl #3]
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x8]
; VBITS_GE_256-NEXT: ret
; VBITS_GE_1024-LABEL: load_sext_v32i32i64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: mov x9, #16
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: sunpklo z1.d, z0.s
; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
; VBITS_GE_1024-NEXT: sunpklo z0.d, z0.s
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8]
; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: load_sext_v32i32i64:
; VBITS_GE_2048: // %bb.0:


@@ -1,28 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; Don't use SVE when its registers are no bigger than NEON.
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
; i8
; Don't use SVE for 64-bit vectors.
define <4 x i8> @extract_subvector_v8i8(<8 x i8> %op) #0 {
define <4 x i8> @extract_subvector_v8i8(<8 x i8> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: zip2 v0.8b, v0.8b, v0.8b
@@ -32,7 +18,7 @@ define <4 x i8> @extract_subvector_v8i8(<8 x i8> %op) #0 {
}
; Don't use SVE for 128-bit vectors.
define <8 x i8> @extract_subvector_v16i8(<16 x i8> %op) #0 {
define <8 x i8> @extract_subvector_v16i8(<16 x i8> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
@@ -42,7 +28,7 @@ define <8 x i8> @extract_subvector_v16i8(<16 x i8> %op) #0 {
ret <8 x i8> %ret
}
define void @extract_subvector_v32i8(<32 x i8>* %a, <16 x i8>* %b) #0 {
define void @extract_subvector_v32i8(<32 x i8>* %a, <16 x i8>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl32
@@ -79,62 +65,30 @@ define void @extract_subvector_v64i8(<64 x i8>* %a, <32 x i8>* %b) #0 {
ret void
}
define void @extract_subvector_v128i8(<128 x i8>* %a, <64 x i8>* %b) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v128i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov w8, #64
; VBITS_GE_256-NEXT: mov w9, #96
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x9]
; VBITS_GE_256-NEXT: mov w8, #32
; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x1, x8]
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: extract_subvector_v128i8:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.b, vl128
; VBITS_GE_1024-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.b, vl64
; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
; VBITS_GE_1024-NEXT: st1b { z0.b }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @extract_subvector_v128i8(<128 x i8>* %a, <64 x i8>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: extract_subvector_v128i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl128
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ptrue p0.b, vl64
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #64
; CHECK-NEXT: st1b { z0.b }, p0, [x1]
; CHECK-NEXT: ret
%op = load <128 x i8>, <128 x i8>* %a
%ret = call <64 x i8> @llvm.experimental.vector.extract.v64i8.v128i8(<128 x i8> %op, i64 64)
store <64 x i8> %ret, <64 x i8>* %b
ret void
}
define void @extract_subvector_v256i8(<256 x i8>* %a, <128 x i8>* %b) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v256i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov w8, #128
; VBITS_GE_256-NEXT: mov w9, #160
; VBITS_GE_256-NEXT: mov w10, #224
; VBITS_GE_256-NEXT: mov w11, #192
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x9]
; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0, x10]
; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x0, x11]
; VBITS_GE_256-NEXT: mov w8, #64
; VBITS_GE_256-NEXT: mov w9, #96
; VBITS_GE_256-NEXT: mov w10, #32
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1]
; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x1, x8]
; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x1, x9]
; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x1, x10]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: extract_subvector_v256i8:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.b, vl256
; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.b, vl128
; VBITS_GE_2048-NEXT: ext z0.b, z0.b, z0.b, #128
; VBITS_GE_2048-NEXT: st1b { z0.b }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
define void @extract_subvector_v256i8(<256 x i8>* %a, <128 x i8>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: extract_subvector_v256i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl256
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ptrue p0.b, vl128
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT: st1b { z0.b }, p0, [x1]
; CHECK-NEXT: ret
%op = load <256 x i8>, <256 x i8>* %a
%ret = call <128 x i8> @llvm.experimental.vector.extract.v128i8.v256i8(<256 x i8> %op, i64 128)
store <128 x i8> %ret, <128 x i8>* %b
@@ -144,7 +98,7 @@ define void @extract_subvector_v256i8(<256 x i8>* %a, <128 x i8>* %b) #0 {
; i16
; Don't use SVE for 64-bit vectors.
define <2 x i16> @extract_subvector_v4i16(<4 x i16> %op) #0 {
define <2 x i16> @extract_subvector_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
@@ -159,7 +113,7 @@ define <2 x i16> @extract_subvector_v4i16(<4 x i16> %op) #0 {
}
; Don't use SVE for 128-bit vectors.
define <4 x i16> @extract_subvector_v8i16(<8 x i16> %op) #0 {
define <4 x i16> @extract_subvector_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
@@ -169,7 +123,7 @@ define <4 x i16> @extract_subvector_v8i16(<8 x i16> %op) #0 {
ret <4 x i16> %ret
}
define void @extract_subvector_v16i16(<16 x i16>* %a, <8 x i16>* %b) #0 {
define void @extract_subvector_v16i16(<16 x i16>* %a, <8 x i16>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v16i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -206,62 +160,30 @@ define void @extract_subvector_v32i16(<32 x i16>* %a, <16 x i16>* %b) #0 {
ret void
}
define void @extract_subvector_v64i16(<64 x i16>* %a, <32 x i16>* %b) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v64i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #32
; VBITS_GE_256-NEXT: mov x9, #48
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: extract_subvector_v64i16:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.h, vl32
; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @extract_subvector_v64i16(<64 x i16>* %a, <32 x i16>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: extract_subvector_v64i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ptrue p0.h, vl32
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #64
; CHECK-NEXT: st1h { z0.h }, p0, [x1]
; CHECK-NEXT: ret
%op = load <64 x i16>, <64 x i16>* %a
%ret = call <32 x i16> @llvm.experimental.vector.extract.v32i16.v64i16(<64 x i16> %op, i64 32)
store <32 x i16> %ret, <32 x i16>* %b
ret void
}
define void @extract_subvector_v128i16(<128 x i16>* %a, <64 x i16>* %b) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v128i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #64
; VBITS_GE_256-NEXT: mov x9, #80
; VBITS_GE_256-NEXT: mov x10, #112
; VBITS_GE_256-NEXT: mov x11, #96
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1]
; VBITS_GE_256-NEXT: mov x8, #32
; VBITS_GE_256-NEXT: mov x9, #48
; VBITS_GE_256-NEXT: mov x10, #16
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x9, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1, x10, lsl #1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: extract_subvector_v128i16:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.h, vl64
; VBITS_GE_2048-NEXT: ext z0.b, z0.b, z0.b, #128
; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
define void @extract_subvector_v128i16(<128 x i16>* %a, <64 x i16>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: extract_subvector_v128i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT: st1h { z0.h }, p0, [x1]
; CHECK-NEXT: ret
%op = load <128 x i16>, <128 x i16>* %a
%ret = call <64 x i16> @llvm.experimental.vector.extract.v64i16.v128i16(<128 x i16> %op, i64 64)
store <64 x i16> %ret, <64 x i16>* %b
@@ -271,7 +193,7 @@ define void @extract_subvector_v128i16(<128 x i16>* %a, <64 x i16>* %b) #0 {
; i32
; Don't use SVE for 64-bit vectors.
define <1 x i32> @extract_subvector_v2i32(<2 x i32> %op) #0 {
define <1 x i32> @extract_subvector_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
@@ -282,7 +204,7 @@ define <1 x i32> @extract_subvector_v2i32(<2 x i32> %op) #0 {
}
; Don't use SVE for 128-bit vectors.
define <2 x i32> @extract_subvector_v4i32(<4 x i32> %op) #0 {
define <2 x i32> @extract_subvector_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
@@ -292,7 +214,7 @@ define <2 x i32> @extract_subvector_v4i32(<4 x i32> %op) #0 {
ret <2 x i32> %ret
}
define void @extract_subvector_v8i32(<8 x i32>* %a, <4 x i32>* %b) #0 {
define void @extract_subvector_v8i32(<8 x i32>* %a, <4 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -329,62 +251,30 @@ define void @extract_subvector_v16i32(<16 x i32>* %a, <8 x i32>* %b) #0 {
ret void
}
define void @extract_subvector_v32i32(<32 x i32>* %a, <16 x i32>* %b) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v32i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: mov x9, #24
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: extract_subvector_v32i32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @extract_subvector_v32i32(<32 x i32>* %a, <16 x i32>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: extract_subvector_v32i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ptrue p0.s, vl16
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #64
; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret
%op = load <32 x i32>, <32 x i32>* %a
%ret = call <16 x i32> @llvm.experimental.vector.extract.v16i32.v32i32(<32 x i32> %op, i64 16)
store <16 x i32> %ret, <16 x i32>* %b
ret void
}
define void @extract_subvector_v64i32(<64 x i32>* %a, <32 x i32>* %b) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v64i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #32
; VBITS_GE_256-NEXT: mov x9, #40
; VBITS_GE_256-NEXT: mov x10, #56
; VBITS_GE_256-NEXT: mov x11, #48
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2]
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: mov x9, #24
; VBITS_GE_256-NEXT: mov x10, #8
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x10, lsl #2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: extract_subvector_v64i32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
; VBITS_GE_2048-NEXT: ext z0.b, z0.b, z0.b, #128
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
define void @extract_subvector_v64i32(<64 x i32>* %a, <32 x i32>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: extract_subvector_v64i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret
%op = load <64 x i32>, <64 x i32>* %a
%ret = call <32 x i32> @llvm.experimental.vector.extract.v32i32.v64i32(<64 x i32> %op, i64 32)
store <32 x i32> %ret, <32 x i32>* %b
@@ -394,7 +284,7 @@ define void @extract_subvector_v64i32(<64 x i32>* %a, <32 x i32>* %b) #0 {
; i64
; Don't use SVE for 128-bit vectors.
define <1 x i64> @extract_subvector_v2i64(<2 x i64> %op) #0 {
define <1 x i64> @extract_subvector_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
@@ -404,7 +294,7 @@ define <1 x i64> @extract_subvector_v2i64(<2 x i64> %op) #0 {
ret <1 x i64> %ret
}
define void @extract_subvector_v4i64(<4 x i64>* %a, <2 x i64>* %b) #0 {
define void @extract_subvector_v4i64(<4 x i64>* %a, <2 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -418,23 +308,14 @@ define void @extract_subvector_v4i64(<4 x i64>* %a, <2 x i64>* %b) #0 {
ret void
}
define void @extract_subvector_v8i64(<8 x i64>* %a, <4 x i64>* %b) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: extract_subvector_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ptrue p0.d, vl4
; VBITS_GE_512-NEXT: ext z0.b, z0.b, z0.b, #32
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT: ret
define void @extract_subvector_v8i64(<8 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v8i64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x8, #4
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; CHECK-NEXT: ret
%op = load <8 x i64>, <8 x i64>* %a
%ret = call <4 x i64> @llvm.experimental.vector.extract.v4i64.v8i64(<8 x i64> %op, i64 4)
store <4 x i64> %ret, <4 x i64>* %b
@@ -453,50 +334,20 @@ define void @extract_subvector_v16i64(<16 x i64>* %a, <8 x i64>* %b) #0 {
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: extract_subvector_v16i64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.d, vl8
; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
%op = load <16 x i64>, <16 x i64>* %a
%ret = call <8 x i64> @llvm.experimental.vector.extract.v8i64.v16i64(<16 x i64> %op, i64 8)
store <8 x i64> %ret, <8 x i64>* %b
ret void
}
define void @extract_subvector_v32i64(<32 x i64>* %a, <16 x i64>* %b) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v32i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: mov x9, #20
; VBITS_GE_256-NEXT: mov x10, #28
; VBITS_GE_256-NEXT: mov x11, #24
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3]
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: mov x9, #12
; VBITS_GE_256-NEXT: mov x10, #4
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x10, lsl #3]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: extract_subvector_v32i64:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.d, vl16
; VBITS_GE_2048-NEXT: ext z0.b, z0.b, z0.b, #128
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
define void @extract_subvector_v32i64(<32 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: extract_subvector_v32i64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x8, #16
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; CHECK-NEXT: ret
%op = load <32 x i64>, <32 x i64>* %a
%ret = call <16 x i64> @llvm.experimental.vector.extract.v16i64.v32i64(<32 x i64> %op, i64 16)
store <16 x i64> %ret, <16 x i64>* %b
@@ -506,7 +357,7 @@ define void @extract_subvector_v32i64(<32 x i64>* %a, <16 x i64>* %b) #0 {
; f16
; Don't use SVE for 64-bit vectors.
define <2 x half> @extract_subvector_v4f16(<4 x half> %op) #0 {
define <2 x half> @extract_subvector_v4f16(<4 x half> %op) vscale_range(16,0) #0 {
; CHECK-LABEL: extract_subvector_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
@@ -517,7 +368,7 @@ define <2 x half> @extract_subvector_v4f16(<4 x half> %op) #0 {
}
; Don't use SVE for 128-bit vectors.
define <4 x half> @extract_subvector_v8f16(<8 x half> %op) #0 {
define <4 x half> @extract_subvector_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
@@ -527,7 +378,7 @@ define <4 x half> @extract_subvector_v8f16(<8 x half> %op) #0 {
ret <4 x half> %ret
}
define void @extract_subvector_v16f16(<16 x half>* %a, <8 x half>* %b) #0 {
define void @extract_subvector_v16f16(<16 x half>* %a, <8 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -564,62 +415,30 @@ define void @extract_subvector_v32f16(<32 x half>* %a, <16 x half>* %b) #0 {
ret void
}
define void @extract_subvector_v64f16(<64 x half>* %a, <32 x half>* %b) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v64f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #32
; VBITS_GE_256-NEXT: mov x9, #48
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: extract_subvector_v64f16:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.h, vl32
; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @extract_subvector_v64f16(<64 x half>* %a, <32 x half>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: extract_subvector_v64f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ptrue p0.h, vl32
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #64
; CHECK-NEXT: st1h { z0.h }, p0, [x1]
; CHECK-NEXT: ret
%op = load <64 x half>, <64 x half>* %a
%ret = call <32 x half> @llvm.experimental.vector.extract.v32f16.v64f16(<64 x half> %op, i64 32)
store <32 x half> %ret, <32 x half>* %b
ret void
}
define void @extract_subvector_v128f16(<128 x half>* %a, <64 x half>* %b) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v128f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #64
; VBITS_GE_256-NEXT: mov x9, #80
; VBITS_GE_256-NEXT: mov x10, #112
; VBITS_GE_256-NEXT: mov x11, #96
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1]
; VBITS_GE_256-NEXT: mov x8, #32
; VBITS_GE_256-NEXT: mov x9, #48
; VBITS_GE_256-NEXT: mov x10, #16
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x9, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1, x10, lsl #1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: extract_subvector_v128f16:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.h, vl64
; VBITS_GE_2048-NEXT: ext z0.b, z0.b, z0.b, #128
; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
define void @extract_subvector_v128f16(<128 x half>* %a, <64 x half>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: extract_subvector_v128f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT: st1h { z0.h }, p0, [x1]
; CHECK-NEXT: ret
%op = load <128 x half>, <128 x half>* %a
%ret = call <64 x half> @llvm.experimental.vector.extract.v64f16.v128f16(<128 x half> %op, i64 64)
store <64 x half> %ret, <64 x half>* %b
@@ -629,7 +448,7 @@ define void @extract_subvector_v128f16(<128 x half>* %a, <64 x half>* %b) #0 {
; f32
; Don't use SVE for 64-bit vectors.
define <1 x float> @extract_subvector_v2f32(<2 x float> %op) #0 {
define <1 x float> @extract_subvector_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
@@ -640,7 +459,7 @@ define <1 x float> @extract_subvector_v2f32(<2 x float> %op) #0 {
}
; Don't use SVE for 128-bit vectors.
define <2 x float> @extract_subvector_v4f32(<4 x float> %op) #0 {
define <2 x float> @extract_subvector_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
@@ -650,7 +469,7 @@ define <2 x float> @extract_subvector_v4f32(<4 x float> %op) #0 {
ret <2 x float> %ret
}
define void @extract_subvector_v8f32(<8 x float>* %a, <4 x float>* %b) #0 {
define void @extract_subvector_v8f32(<8 x float>* %a, <4 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -687,62 +506,30 @@ define void @extract_subvector_v16f32(<16 x float>* %a, <8 x float>* %b) #0 {
ret void
}
define void @extract_subvector_v32f32(<32 x float>* %a, <16 x float>* %b) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v32f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: mov x9, #24
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: extract_subvector_v32f32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @extract_subvector_v32f32(<32 x float>* %a, <16 x float>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: extract_subvector_v32f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ptrue p0.s, vl16
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #64
; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret
%op = load <32 x float>, <32 x float>* %a
%ret = call <16 x float> @llvm.experimental.vector.extract.v16f32.v32f32(<32 x float> %op, i64 16)
store <16 x float> %ret, <16 x float>* %b
ret void
}
define void @extract_subvector_v64f32(<64 x float>* %a, <32 x float>* %b) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v64f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #32
; VBITS_GE_256-NEXT: mov x9, #40
; VBITS_GE_256-NEXT: mov x10, #56
; VBITS_GE_256-NEXT: mov x11, #48
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2]
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: mov x9, #24
; VBITS_GE_256-NEXT: mov x10, #8
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x10, lsl #2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: extract_subvector_v64f32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
; VBITS_GE_2048-NEXT: ext z0.b, z0.b, z0.b, #128
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
define void @extract_subvector_v64f32(<64 x float>* %a, <32 x float>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: extract_subvector_v64f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret
%op = load <64 x float>, <64 x float>* %a
%ret = call <32 x float> @llvm.experimental.vector.extract.v32f32.v64f32(<64 x float> %op, i64 32)
store <32 x float> %ret, <32 x float>* %b
@@ -752,7 +539,7 @@ define void @extract_subvector_v64f32(<64 x float>* %a, <32 x float>* %b) #0 {
; f64
; Don't use SVE for 128-bit vectors.
define <1 x double> @extract_subvector_v2f64(<2 x double> %op) #0 {
define <1 x double> @extract_subvector_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
@@ -762,7 +549,7 @@ define <1 x double> @extract_subvector_v2f64(<2 x double> %op) #0 {
ret <1 x double> %ret
}
define void @extract_subvector_v4f64(<4 x double>* %a, <2 x double>* %b) #0 {
define void @extract_subvector_v4f64(<4 x double>* %a, <2 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -799,62 +586,30 @@ define void @extract_subvector_v8f64(<8 x double>* %a, <4 x double>* %b) #0 {
ret void
}
define void @extract_subvector_v16f64(<16 x double>* %a, <8 x double>* %b) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v16f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: mov x9, #12
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: extract_subvector_v16f64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.d, vl8
; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @extract_subvector_v16f64(<16 x double>* %a, <8 x double>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: extract_subvector_v16f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ptrue p0.d, vl8
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #64
; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; CHECK-NEXT: ret
%op = load <16 x double>, <16 x double>* %a
%ret = call <8 x double> @llvm.experimental.vector.extract.v8f64.v16f64(<16 x double> %op, i64 8)
store <8 x double> %ret, <8 x double>* %b
ret void
}
define void @extract_subvector_v32f64(<32 x double>* %a, <16 x double>* %b) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v32f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: mov x9, #20
; VBITS_GE_256-NEXT: mov x10, #28
; VBITS_GE_256-NEXT: mov x11, #24
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3]
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: mov x9, #12
; VBITS_GE_256-NEXT: mov x10, #4
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x10, lsl #3]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: extract_subvector_v32f64:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.d, vl16
; VBITS_GE_2048-NEXT: ext z0.b, z0.b, z0.b, #128
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
define void @extract_subvector_v32f64(<32 x double>* %a, <16 x double>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: extract_subvector_v32f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; CHECK-NEXT: ret
%op = load <32 x double>, <32 x double>* %a
%ret = call <16 x double> @llvm.experimental.vector.extract.v16f64.v32f64(<32 x double> %op, i64 16)
store <16 x double> %ret, <16 x double>* %b

@@ -1,221 +1,259 @@
; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
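; A regeneration sketch for the autogenerated assertions above, assuming an
; in-tree build of llc (both paths below are placeholders):
;   llvm/utils/update_llc_test_checks.py --llc-binary=<build>/bin/llc <this test file>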
target triple = "aarch64-unknown-linux-gnu"
; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: ptrue
;
; extractelement
;
; Don't use SVE for 64-bit vectors.
define half @extractelement_v4f16(<4 x half> %op1) #0 {
define half @extractelement_v4f16(<4 x half> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: extractelement_v4f16:
; CHECK: mov h0, v0.h[3]
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov h0, v0.h[3]
; CHECK-NEXT: ret
%r = extractelement <4 x half> %op1, i64 3
ret half %r
}
; Don't use SVE for 128-bit vectors.
define half @extractelement_v8f16(<8 x half> %op1) #0 {
define half @extractelement_v8f16(<8 x half> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: extractelement_v8f16:
; CHECK: mov h0, v0.h[7]
; CHECK: // %bb.0:
; CHECK-NEXT: mov h0, v0.h[7]
; CHECK-NEXT: ret
%r = extractelement <8 x half> %op1, i64 7
ret half %r
}
define half @extractelement_v16f16(<16 x half>* %a) #0 {
define half @extractelement_v16f16(<16 x half>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: extractelement_v16f16:
; VBITS_GE_256: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: mov z0.h, z0.h[15]
; VBITS_GE_256-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: mov z0.h, z0.h[15]
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT: ret
%op1 = load <16 x half>, <16 x half>* %a
%r = extractelement <16 x half> %op1, i64 15
ret half %r
}
define half @extractelement_v32f16(<32 x half>* %a) #0 {
; CHECK-LABEL: extractelement_v32f16:
; VBITS_GE_512: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-LABEL: extractelement_v32f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: mov z0.h, z0.h[15]
; VBITS_GE_256-NEXT: // kill: def $h0 killed $h0 killed $z0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: extractelement_v32f16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: mov z0.h, z0.h[31]
; VBITS_GE_512-NEXT: // kill: def $h0 killed $h0 killed $z0
; VBITS_GE_512-NEXT: ret
%op1 = load <32 x half>, <32 x half>* %a
%r = extractelement <32 x half> %op1, i64 31
ret half %r
}
define half @extractelement_v64f16(<64 x half>* %a) #0 {
define half @extractelement_v64f16(<64 x half>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: extractelement_v64f16:
; VBITS_GE_1024: ptrue p0.h, vl64
; VBITS_GE_1024-NEXT: mov w8, #63
; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: whilels p0.h, xzr, x8
; VBITS_GE_1024-NEXT: lastb h0, p0, z0.h
; VBITS_GE_1024-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: mov w8, #63
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: whilels p0.h, xzr, x8
; CHECK-NEXT: lastb h0, p0, z0.h
; CHECK-NEXT: ret
%op1 = load <64 x half>, <64 x half>* %a
%r = extractelement <64 x half> %op1, i64 63
ret half %r
}
define half @extractelement_v128f16(<128 x half>* %a) #0 {
define half @extractelement_v128f16(<128 x half>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: extractelement_v128f16:
; VBITS_GE_2048: ptrue p0.h, vl128
; VBITS_GE_2048-NEXT: mov w8, #127
; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_2048-NEXT: whilels p0.h, xzr, x8
; VBITS_GE_2048-NEXT: lastb h0, p0, z0.h
; VBITS_GE_2048-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: mov w8, #127
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: whilels p0.h, xzr, x8
; CHECK-NEXT: lastb h0, p0, z0.h
; CHECK-NEXT: ret
%op1 = load <128 x half>, <128 x half>* %a
%r = extractelement <128 x half> %op1, i64 127
ret half %r
}
; Don't use SVE for 64-bit vectors.
define float @extractelement_v2f32(<2 x float> %op1) #0 {
define float @extractelement_v2f32(<2 x float> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: extractelement_v2f32:
; CHECK: mov s0, v0.s[1]
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov s0, v0.s[1]
; CHECK-NEXT: ret
%r = extractelement <2 x float> %op1, i64 1
ret float %r
}
; Don't use SVE for 128-bit vectors.
define float @extractelement_v4f32(<4 x float> %op1) #0 {
define float @extractelement_v4f32(<4 x float> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: extractelement_v4f32:
; CHECK: mov s0, v0.s[3]
; CHECK: // %bb.0:
; CHECK-NEXT: mov s0, v0.s[3]
; CHECK-NEXT: ret
%r = extractelement <4 x float> %op1, i64 3
ret float %r
}
define float @extractelement_v8f32(<8 x float>* %a) #0 {
define float @extractelement_v8f32(<8 x float>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: extractelement_v8f32:
; VBITS_GE_256: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: mov z0.s, z0.s[7]
; VBITS_GE_256-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: mov z0.s, z0.s[7]
; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
; CHECK-NEXT: ret
%op1 = load <8 x float>, <8 x float>* %a
%r = extractelement <8 x float> %op1, i64 7
ret float %r
}
define float @extractelement_v16f32(<16 x float>* %a) #0 {
; CHECK-LABEL: extractelement_v16f32:
; VBITS_GE_512: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-LABEL: extractelement_v16f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: mov z0.s, z0.s[7]
; VBITS_GE_256-NEXT: // kill: def $s0 killed $s0 killed $z0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: extractelement_v16f32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: mov z0.s, z0.s[15]
; VBITS_GE_512-NEXT: // kill: def $s0 killed $s0 killed $z0
; VBITS_GE_512-NEXT: ret
%op1 = load <16 x float>, <16 x float>* %a
%r = extractelement <16 x float> %op1, i64 15
ret float %r
}
define float @extractelement_v32f32(<32 x float>* %a) #0 {
define float @extractelement_v32f32(<32 x float>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: extractelement_v32f32:
; VBITS_GE_1024: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: mov w8, #31
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: whilels p0.s, xzr, x8
; VBITS_GE_1024-NEXT: lastb s0, p0, z0.s
; VBITS_GE_1024-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: mov w8, #31
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: whilels p0.s, xzr, x8
; CHECK-NEXT: lastb s0, p0, z0.s
; CHECK-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
%r = extractelement <32 x float> %op1, i64 31
ret float %r
}
define float @extractelement_v64f32(<64 x float>* %a) #0 {
define float @extractelement_v64f32(<64 x float>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: extractelement_v64f32:
; VBITS_GE_2048: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: mov w8, #63
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: whilels p0.s, xzr, x8
; VBITS_GE_2048-NEXT: lastb s0, p0, z0.s
; VBITS_GE_2048-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: mov w8, #63
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: whilels p0.s, xzr, x8
; CHECK-NEXT: lastb s0, p0, z0.s
; CHECK-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a
%r = extractelement <64 x float> %op1, i64 63
ret float %r
}
; Don't use SVE for 64-bit vectors.
define double @extractelement_v1f64(<1 x double> %op1) #0 {
define double @extractelement_v1f64(<1 x double> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: extractelement_v1f64:
; CHECK: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ret
%r = extractelement <1 x double> %op1, i64 0
ret double %r
}
; Don't use SVE for 128-bit vectors.
define double @extractelement_v2f64(<2 x double> %op1) #0 {
define double @extractelement_v2f64(<2 x double> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: extractelement_v2f64:
; CHECK: mov d0, v0.d[1]
; CHECK: // %bb.0:
; CHECK-NEXT: mov d0, v0.d[1]
; CHECK-NEXT: ret
%r = extractelement <2 x double> %op1, i64 1
ret double %r
}
define double @extractelement_v4f64(<4 x double>* %a) #0 {
define double @extractelement_v4f64(<4 x double>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: extractelement_v4f64:
; VBITS_GE_256: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: mov z0.d, z0.d[3]
; VBITS_GE_256-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: mov z0.d, z0.d[3]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%op1 = load <4 x double>, <4 x double>* %a
%r = extractelement <4 x double> %op1, i64 3
ret double %r
}
define double @extractelement_v8f64(<8 x double>* %a) #0 {
; CHECK-LABEL: extractelement_v8f64:
; VBITS_GE_512: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_256-LABEL: extractelement_v8f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: mov z0.d, z0.d[3]
; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: extractelement_v8f64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: mov z0.d, z0.d[7]
; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_512-NEXT: ret
%op1 = load <8 x double>, <8 x double>* %a
%r = extractelement <8 x double> %op1, i64 7
ret double %r
}
define double @extractelement_v16f64(<16 x double>* %a) #0 {
define double @extractelement_v16f64(<16 x double>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: extractelement_v16f64:
; VBITS_GE_1024: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: mov w8, #15
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: whilels p0.d, xzr, x8
; VBITS_GE_1024-NEXT: lastb d0, p0, z0.d
; VBITS_GE_1024-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: mov w8, #15
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: whilels p0.d, xzr, x8
; CHECK-NEXT: lastb d0, p0, z0.d
; CHECK-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%r = extractelement <16 x double> %op1, i64 15
ret double %r
}
define double @extractelement_v32f64(<32 x double>* %a) #0 {
define double @extractelement_v32f64(<32 x double>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: extractelement_v32f64:
; VBITS_GE_2048: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: mov w8, #31
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: whilels p0.d, xzr, x8
; VBITS_GE_2048-NEXT: lastb d0, p0, z0.d
; VBITS_GE_2048-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: mov w8, #31
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: whilels p0.d, xzr, x8
; CHECK-NEXT: lastb d0, p0, z0.d
; CHECK-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%r = extractelement <32 x double> %op1, i64 31
ret double %r

File diff suppressed because it is too large

@@ -1,21 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; Don't use SVE when its registers are no bigger than NEON.
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep 'z[0-9]'
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
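; With -check-prefixes=CHECK,VBITS_GE_512 FileCheck honours directives under
; either prefix, which is how the 2048-bit RUN line reuses the 512-bit
; expectations. A minimal sketch of a directive pair both RUN lines would
; match (hypothetical function @f):
;   ; CHECK-LABEL: f:
;   ; VBITS_GE_512: ptrue p0.d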
target triple = "aarch64-unknown-linux-gnu"
@@ -24,7 +10,7 @@ target triple = "aarch64-unknown-linux-gnu"
;
; Don't use SVE for 64-bit vectors.
define <4 x i16> @fcmp_oeq_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
define <4 x i16> @fcmp_oeq_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_oeq_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: fcmeq v0.4h, v0.4h, v1.4h
@@ -35,7 +21,7 @@ define <4 x i16> @fcmp_oeq_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
}
; Don't use SVE for 128-bit vectors.
define <8 x i16> @fcmp_oeq_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
define <8 x i16> @fcmp_oeq_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_oeq_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: fcmeq v0.8h, v0.8h, v1.8h
@@ -45,7 +31,7 @@ define <8 x i16> @fcmp_oeq_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
ret <8 x i16> %sext
}
define void @fcmp_oeq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_oeq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_oeq_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -64,7 +50,6 @@ define void @fcmp_oeq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
}
define void @fcmp_oeq_v32f16(<32 x half>* %a, <32 x half>* %b, <32 x i16>* %c) #0 {
; Ensure sensible type legalisation
; VBITS_GE_256-LABEL: fcmp_oeq_v32f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16
@@ -98,44 +83,16 @@ define void @fcmp_oeq_v32f16(<32 x half>* %a, <32 x half>* %b, <32 x i16>* %c) #
ret void
}
define void @fcmp_oeq_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x i16>* %c) #0 {
; VBITS_GE_256-LABEL: fcmp_oeq_v64f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #32
; VBITS_GE_256-NEXT: mov x9, #48
; VBITS_GE_256-NEXT: mov x10, #16
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x1, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x1, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z2.h, z5.h
; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z1.h, z4.h
; VBITS_GE_256-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z2.h, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z0.h, z6.h
; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z3.h, z7.h
; VBITS_GE_256-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z3.h, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x2, x9, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2, x10, lsl #1]
; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: fcmp_oeq_v64f16:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_1024-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
; VBITS_GE_1024-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x2]
; VBITS_GE_1024-NEXT: ret
define void @fcmp_oeq_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x i16>* %c) vscale_range(8,0) #0 {
; CHECK-LABEL: fcmp_oeq_v64f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: st1h { z0.h }, p0, [x2]
; CHECK-NEXT: ret
%op1 = load <64 x half>, <64 x half>* %a
%op2 = load <64 x half>, <64 x half>* %b
%cmp = fcmp oeq <64 x half> %op1, %op2
@@ -144,68 +101,16 @@ define void @fcmp_oeq_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x i16>* %c) #
ret void
}
define void @fcmp_oeq_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x i16>* %c) #0 {
; VBITS_GE_256-LABEL: fcmp_oeq_v128f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #96
; VBITS_GE_256-NEXT: mov x9, #112
; VBITS_GE_256-NEXT: mov x10, #64
; VBITS_GE_256-NEXT: mov x11, #80
; VBITS_GE_256-NEXT: mov x12, #32
; VBITS_GE_256-NEXT: mov x13, #48
; VBITS_GE_256-NEXT: mov x14, #16
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x0, x12, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x0, x13, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x0, x14, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z16.h }, p0/z, [x1, x13, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z17.h }, p0/z, [x1, x14, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z20.h }, p0/z, [x1, x11, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z21.h }, p0/z, [x1, x12, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z19.h }, p0/z, [x1, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z22.h }, p0/z, [x1, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z18.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z23.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z6.h, z17.h
; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z5.h, z16.h
; VBITS_GE_256-NEXT: mov z5.h, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z6.h, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z4.h, z21.h
; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z3.h, z20.h
; VBITS_GE_256-NEXT: mov z3.h, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z4.h, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z2.h, z22.h
; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z1.h, z19.h
; VBITS_GE_256-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z2.h, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z0.h, z18.h
; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z7.h, z23.h
; VBITS_GE_256-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z7.h, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x2, x9, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2, x10, lsl #1]
; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x2, x11, lsl #1]
; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x2, x12, lsl #1]
; VBITS_GE_256-NEXT: st1h { z6.h }, p0, [x2, x13, lsl #1]
; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x2, x14, lsl #1]
; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: fcmp_oeq_v128f16:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_2048-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
; VBITS_GE_2048-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x2]
; VBITS_GE_2048-NEXT: ret
define void @fcmp_oeq_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x i16>* %c) vscale_range(16,0) #0 {
; CHECK-LABEL: fcmp_oeq_v128f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: st1h { z0.h }, p0, [x2]
; CHECK-NEXT: ret
%op1 = load <128 x half>, <128 x half>* %a
%op2 = load <128 x half>, <128 x half>* %b
%cmp = fcmp oeq <128 x half> %op1, %op2
@@ -215,7 +120,7 @@ define void @fcmp_oeq_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x i16>* %
}
; Don't use SVE for 64-bit vectors.
define <2 x i32> @fcmp_oeq_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
define <2 x i32> @fcmp_oeq_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_oeq_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fcmeq v0.2s, v0.2s, v1.2s
@@ -226,7 +131,7 @@ define <2 x i32> @fcmp_oeq_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
}
; Don't use SVE for 128-bit vectors.
define <4 x i32> @fcmp_oeq_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
define <4 x i32> @fcmp_oeq_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_oeq_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fcmeq v0.4s, v0.4s, v1.4s
@@ -236,7 +141,7 @@ define <4 x i32> @fcmp_oeq_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
ret <4 x i32> %sext
}
define void @fcmp_oeq_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x i32>* %c) #0 {
define void @fcmp_oeq_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x i32>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_oeq_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -255,7 +160,6 @@ define void @fcmp_oeq_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x i32>* %c) #0
}
define void @fcmp_oeq_v16f32(<16 x float>* %a, <16 x float>* %b, <16 x i32>* %c) #0 {
; Ensure sensible type legalisation
; VBITS_GE_256-LABEL: fcmp_oeq_v16f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
@@ -289,44 +193,16 @@ define void @fcmp_oeq_v16f32(<16 x float>* %a, <16 x float>* %b, <16 x i32>* %c)
ret void
}
define void @fcmp_oeq_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x i32>* %c) #0 {
; VBITS_GE_256-LABEL: fcmp_oeq_v32f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: mov x9, #24
; VBITS_GE_256-NEXT: mov x10, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x1, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z2.s, z5.s
; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z1.s, z4.s
; VBITS_GE_256-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z2.s, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, z6.s
; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z3.s, z7.s
; VBITS_GE_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x2, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: fcmp_oeq_v32f32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_1024-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_1024-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x2]
; VBITS_GE_1024-NEXT: ret
define void @fcmp_oeq_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x i32>* %c) vscale_range(8,0) #0 {
; CHECK-LABEL: fcmp_oeq_v32f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: st1w { z0.s }, p0, [x2]
; CHECK-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
%op2 = load <32 x float>, <32 x float>* %b
%cmp = fcmp oeq <32 x float> %op1, %op2
@@ -335,68 +211,16 @@ define void @fcmp_oeq_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x i32>* %c)
ret void
}
define void @fcmp_oeq_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x i32>* %c) #0 {
; VBITS_GE_256-LABEL: fcmp_oeq_v64f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #48
; VBITS_GE_256-NEXT: mov x9, #56
; VBITS_GE_256-NEXT: mov x10, #32
; VBITS_GE_256-NEXT: mov x11, #40
; VBITS_GE_256-NEXT: mov x12, #16
; VBITS_GE_256-NEXT: mov x13, #24
; VBITS_GE_256-NEXT: mov x14, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x12, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x13, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x14, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1w { z16.s }, p0/z, [x1, x13, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z17.s }, p0/z, [x1, x14, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z20.s }, p0/z, [x1, x11, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z21.s }, p0/z, [x1, x12, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z19.s }, p0/z, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z22.s }, p0/z, [x1, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z18.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z23.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z6.s, z17.s
; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z5.s, z16.s
; VBITS_GE_256-NEXT: mov z5.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z6.s, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z4.s, z21.s
; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z3.s, z20.s
; VBITS_GE_256-NEXT: mov z3.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z4.s, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z2.s, z22.s
; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z1.s, z19.s
; VBITS_GE_256-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z2.s, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, z18.s
; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z7.s, z23.s
; VBITS_GE_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z7.s, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x2, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x2, x11, lsl #2]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x2, x12, lsl #2]
; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x2, x13, lsl #2]
; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x2, x14, lsl #2]
; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: fcmp_oeq_v64f32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_2048-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_2048-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x2]
; VBITS_GE_2048-NEXT: ret
define void @fcmp_oeq_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x i32>* %c) vscale_range(16,0) #0 {
; CHECK-LABEL: fcmp_oeq_v64f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: st1w { z0.s }, p0, [x2]
; CHECK-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a
%op2 = load <64 x float>, <64 x float>* %b
%cmp = fcmp oeq <64 x float> %op1, %op2
@@ -406,7 +230,7 @@ define void @fcmp_oeq_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x i32>* %c)
}
; Don't use SVE for 64-bit vectors.
define <1 x i64> @fcmp_oeq_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
define <1 x i64> @fcmp_oeq_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_oeq_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: fcmeq d0, d0, d1
@@ -417,7 +241,7 @@ define <1 x i64> @fcmp_oeq_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
}
; Don't use SVE for 128-bit vectors.
define <2 x i64> @fcmp_oeq_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
define <2 x i64> @fcmp_oeq_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_oeq_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: fcmeq v0.2d, v0.2d, v1.2d
@@ -427,7 +251,7 @@ define <2 x i64> @fcmp_oeq_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
ret <2 x i64> %sext
}
define void @fcmp_oeq_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x i64>* %c) #0 {
define void @fcmp_oeq_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x i64>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_oeq_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -446,7 +270,6 @@ define void @fcmp_oeq_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x i64>* %c) #
}
define void @fcmp_oeq_v8f64(<8 x double>* %a, <8 x double>* %b, <8 x i64>* %c) #0 {
; Ensure sensible type legalisation
; VBITS_GE_256-LABEL: fcmp_oeq_v8f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
@@ -480,44 +303,16 @@ define void @fcmp_oeq_v8f64(<8 x double>* %a, <8 x double>* %b, <8 x i64>* %c) #
ret void
}
define void @fcmp_oeq_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x i64>* %c) #0 {
; VBITS_GE_256-LABEL: fcmp_oeq_v16f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: mov x9, #12
; VBITS_GE_256-NEXT: mov x10, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x1, x10, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z2.d, z5.d
; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z1.d, z4.d
; VBITS_GE_256-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z2.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, z6.d
; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z3.d, z7.d
; VBITS_GE_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z3.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x2, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: fcmp_oeq_v16f64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_1024-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_1024-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x2]
; VBITS_GE_1024-NEXT: ret
define void @fcmp_oeq_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x i64>* %c) vscale_range(8,0) #0 {
; CHECK-LABEL: fcmp_oeq_v16f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: st1d { z0.d }, p0, [x2]
; CHECK-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%op2 = load <16 x double>, <16 x double>* %b
%cmp = fcmp oeq <16 x double> %op1, %op2
@@ -526,68 +321,16 @@ define void @fcmp_oeq_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x i64>* %
ret void
}
define void @fcmp_oeq_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x i64>* %c) #0 {
; VBITS_GE_256-LABEL: fcmp_oeq_v32f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #24
; VBITS_GE_256-NEXT: mov x9, #28
; VBITS_GE_256-NEXT: mov x10, #16
; VBITS_GE_256-NEXT: mov x11, #20
; VBITS_GE_256-NEXT: mov x12, #8
; VBITS_GE_256-NEXT: mov x13, #12
; VBITS_GE_256-NEXT: mov x14, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1d { z16.d }, p0/z, [x1, x13, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z17.d }, p0/z, [x1, x14, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z20.d }, p0/z, [x1, x11, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z21.d }, p0/z, [x1, x12, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z19.d }, p0/z, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z22.d }, p0/z, [x1, x10, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z18.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z23.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z6.d, z17.d
; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z5.d, z16.d
; VBITS_GE_256-NEXT: mov z5.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z6.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z4.d, z21.d
; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z3.d, z20.d
; VBITS_GE_256-NEXT: mov z3.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z4.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z2.d, z22.d
; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z1.d, z19.d
; VBITS_GE_256-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z2.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, z18.d
; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z7.d, z23.d
; VBITS_GE_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z7.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x2, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x2, x11, lsl #3]
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x2, x12, lsl #3]
; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x2, x13, lsl #3]
; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x2, x14, lsl #3]
; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: fcmp_oeq_v32f64:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_2048-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_2048-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x2]
; VBITS_GE_2048-NEXT: ret
define void @fcmp_oeq_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x i64>* %c) vscale_range(16,0) #0 {
; CHECK-LABEL: fcmp_oeq_v32f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: st1d { z0.d }, p0, [x2]
; CHECK-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%op2 = load <32 x double>, <32 x double>* %b
%cmp = fcmp oeq <32 x double> %op1, %op2
@ -600,7 +343,7 @@ define void @fcmp_oeq_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x i64>* %
; FCMP UEQ
;
define void @fcmp_ueq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_ueq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_ueq_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@ -624,7 +367,7 @@ define void @fcmp_ueq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP ONE
;
define void @fcmp_one_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_one_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_one_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@ -648,7 +391,7 @@ define void @fcmp_one_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP UNE
;
define void @fcmp_une_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_une_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_une_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@ -670,7 +413,7 @@ define void @fcmp_une_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP OGT
;
define void @fcmp_ogt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_ogt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_ogt_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@ -692,7 +435,7 @@ define void @fcmp_ogt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP UGT
;
define void @fcmp_ugt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_ugt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_ugt_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@ -716,7 +459,7 @@ define void @fcmp_ugt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP OLT
;
define void @fcmp_olt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_olt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_olt_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@ -738,7 +481,7 @@ define void @fcmp_olt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP ULT
;
define void @fcmp_ult_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_ult_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_ult_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@ -762,7 +505,7 @@ define void @fcmp_ult_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP OGE
;
define void @fcmp_oge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_oge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_oge_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@ -784,7 +527,7 @@ define void @fcmp_oge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP UGE
;
define void @fcmp_uge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_uge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_uge_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@ -808,7 +551,7 @@ define void @fcmp_uge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP OLE
;
define void @fcmp_ole_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_ole_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_ole_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@ -830,7 +573,7 @@ define void @fcmp_ole_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP ULE
;
define void @fcmp_ule_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_ule_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_ule_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@ -854,7 +597,7 @@ define void @fcmp_ule_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP UNO
;
define void @fcmp_uno_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_uno_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_uno_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@ -876,7 +619,7 @@ define void @fcmp_uno_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP ORD
;
define void @fcmp_ord_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_ord_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_ord_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@ -900,7 +643,7 @@ define void @fcmp_ord_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP EQ
;
define void @fcmp_eq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_eq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_eq_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@ -922,7 +665,7 @@ define void @fcmp_eq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0
; FCMP NE
;
define void @fcmp_ne_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_ne_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_ne_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@ -944,7 +687,7 @@ define void @fcmp_ne_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0
; FCMP GT
;
define void @fcmp_gt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_gt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_gt_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@ -966,7 +709,7 @@ define void @fcmp_gt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0
; FCMP LT
;
define void @fcmp_lt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_lt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_lt_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@ -988,7 +731,7 @@ define void @fcmp_lt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0
; FCMP GE
;
define void @fcmp_ge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_ge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_ge_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@ -1010,7 +753,7 @@ define void @fcmp_ge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0
; FCMP LE
;
define void @fcmp_le_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_le_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_le_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16

@ -1,21 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; Don't use SVE when its registers are no bigger than NEON.
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
@ -24,7 +10,7 @@ target triple = "aarch64-unknown-linux-gnu"
;
; Don't use SVE for 64-bit vectors.
define void @fcvt_v2f16_v2f32(<2 x half>* %a, <2 x float>* %b) #0 {
define void @fcvt_v2f16_v2f32(<2 x half>* %a, <2 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v2f16_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0]
@ -38,7 +24,7 @@ define void @fcvt_v2f16_v2f32(<2 x half>* %a, <2 x float>* %b) #0 {
}
; Don't use SVE for 128-bit vectors.
define void @fcvt_v4f16_v4f32(<4 x half>* %a, <4 x float>* %b) #0 {
define void @fcvt_v4f16_v4f32(<4 x half>* %a, <4 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v4f16_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
@ -51,7 +37,7 @@ define void @fcvt_v4f16_v4f32(<4 x half>* %a, <4 x float>* %b) #0 {
ret void
}
define void @fcvt_v8f16_v8f32(<8 x half>* %a, <8 x float>* %b) #0 {
define void @fcvt_v8f16_v8f32(<8 x half>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v8f16_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@ -66,7 +52,6 @@ define void @fcvt_v8f16_v8f32(<8 x half>* %a, <8 x float>* %b) #0 {
}
define void @fcvt_v16f16_v16f32(<16 x half>* %a, <16 x float>* %b) #0 {
; Ensure sensible type legalisation.
; VBITS_GE_256-LABEL: fcvt_v16f16_v16f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
@ -86,91 +71,34 @@ define void @fcvt_v16f16_v16f32(<16 x half>* %a, <16 x float>* %b) #0 {
; VBITS_GE_512-NEXT: fcvt z0.s, p0/m, z0.h
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT: ret
%op1 = load <16 x half>, <16 x half>* %a
%res = fpext <16 x half> %op1 to <16 x float>
store <16 x float> %res, <16 x float>* %b
ret void
}
define void @fcvt_v32f16_v32f32(<32 x half>* %a, <32 x float>* %b) #0 {
; VBITS_GE_256-LABEL: fcvt_v32f16_v32f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: mov x9, #24
; VBITS_GE_256-NEXT: mov x10, #16
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1h { z0.s }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.s }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z2.s }, p0/z, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: fcvt z0.s, p0/m, z0.h
; VBITS_GE_256-NEXT: fcvt z1.s, p0/m, z1.h
; VBITS_GE_256-NEXT: fcvt z2.s, p0/m, z2.h
; VBITS_GE_256-NEXT: fcvt z3.s, p0/m, z3.h
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: fcvt_v32f16_v32f32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1h { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: fcvt z0.s, p0/m, z0.h
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @fcvt_v32f16_v32f32(<32 x half>* %a, <32 x float>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvt_v32f16_v32f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
; CHECK-NEXT: fcvt z0.s, p0/m, z0.h
; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret
%op1 = load <32 x half>, <32 x half>* %a
%res = fpext <32 x half> %op1 to <32 x float>
store <32 x float> %res, <32 x float>* %b
ret void
}
define void @fcvt_v64f16_v64f32(<64 x half>* %a, <64 x float>* %b) #0 {
; VBITS_GE_256-LABEL: fcvt_v64f16_v64f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: mov x9, #24
; VBITS_GE_256-NEXT: mov x10, #16
; VBITS_GE_256-NEXT: mov x11, #40
; VBITS_GE_256-NEXT: mov x12, #32
; VBITS_GE_256-NEXT: mov x13, #56
; VBITS_GE_256-NEXT: mov x14, #48
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1h { z0.s }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.s }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z2.s }, p0/z, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.s }, p0/z, [x0, x11, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z4.s }, p0/z, [x0, x12, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z5.s }, p0/z, [x0, x13, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z6.s }, p0/z, [x0, x14, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z7.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: fcvt z0.s, p0/m, z0.h
; VBITS_GE_256-NEXT: fcvt z1.s, p0/m, z1.h
; VBITS_GE_256-NEXT: fcvt z2.s, p0/m, z2.h
; VBITS_GE_256-NEXT: fcvt z3.s, p0/m, z3.h
; VBITS_GE_256-NEXT: fcvt z4.s, p0/m, z4.h
; VBITS_GE_256-NEXT: fcvt z5.s, p0/m, z5.h
; VBITS_GE_256-NEXT: fcvt z6.s, p0/m, z6.h
; VBITS_GE_256-NEXT: fcvt z7.s, p0/m, z7.h
; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x1, x14, lsl #2]
; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x1, x13, lsl #2]
; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x1, x12, lsl #2]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x11, lsl #2]
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: fcvt_v64f16_v64f32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: ld1h { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: fcvt z0.s, p0/m, z0.h
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
define void @fcvt_v64f16_v64f32(<64 x half>* %a, <64 x float>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvt_v64f16_v64f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
; CHECK-NEXT: fcvt z0.s, p0/m, z0.h
; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret
%op1 = load <64 x half>, <64 x half>* %a
%res = fpext <64 x half> %op1 to <64 x float>
store <64 x float> %res, <64 x float>* %b
@ -182,7 +110,7 @@ define void @fcvt_v64f16_v64f32(<64 x half>* %a, <64 x float>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
define void @fcvt_v1f16_v1f64(<1 x half>* %a, <1 x double>* %b) #0 {
define void @fcvt_v1f16_v1f64(<1 x half>* %a, <1 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v1f16_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr h0, [x0]
@ -196,7 +124,7 @@ define void @fcvt_v1f16_v1f64(<1 x half>* %a, <1 x double>* %b) #0 {
}
; v2f16 is not legal for NEON, so use SVE
define void @fcvt_v2f16_v2f64(<2 x half>* %a, <2 x double>* %b) #0 {
define void @fcvt_v2f16_v2f64(<2 x half>* %a, <2 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v2f16_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0]
@ -212,7 +140,7 @@ define void @fcvt_v2f16_v2f64(<2 x half>* %a, <2 x double>* %b) #0 {
ret void
}
define void @fcvt_v4f16_v4f64(<4 x half>* %a, <4 x double>* %b) #0 {
define void @fcvt_v4f16_v4f64(<4 x half>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v4f16_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@ -246,91 +174,34 @@ define void @fcvt_v8f16_v8f64(<8 x half>* %a, <8 x double>* %b) #0 {
; VBITS_GE_512-NEXT: fcvt z0.d, p0/m, z0.h
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT: ret
%op1 = load <8 x half>, <8 x half>* %a
%res = fpext <8 x half> %op1 to <8 x double>
store <8 x double> %res, <8 x double>* %b
ret void
}
define void @fcvt_v16f16_v16f64(<16 x half>* %a, <16 x double>* %b) #0 {
; VBITS_GE_256-LABEL: fcvt_v16f16_v16f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: mov x9, #12
; VBITS_GE_256-NEXT: mov x10, #8
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1h { z0.d }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.d }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z2.d }, p0/z, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: fcvt z0.d, p0/m, z0.h
; VBITS_GE_256-NEXT: fcvt z1.d, p0/m, z1.h
; VBITS_GE_256-NEXT: fcvt z2.d, p0/m, z2.h
; VBITS_GE_256-NEXT: fcvt z3.d, p0/m, z3.h
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: fcvt_v16f16_v16f64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1h { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: fcvt z0.d, p0/m, z0.h
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @fcvt_v16f16_v16f64(<16 x half>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvt_v16f16_v16f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
; CHECK-NEXT: fcvt z0.d, p0/m, z0.h
; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; CHECK-NEXT: ret
%op1 = load <16 x half>, <16 x half>* %a
%res = fpext <16 x half> %op1 to <16 x double>
store <16 x double> %res, <16 x double>* %b
ret void
}
define void @fcvt_v32f16_v32f64(<32 x half>* %a, <32 x double>* %b) #0 {
; VBITS_GE_256-LABEL: fcvt_v32f16_v32f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: mov x9, #12
; VBITS_GE_256-NEXT: mov x10, #8
; VBITS_GE_256-NEXT: mov x11, #20
; VBITS_GE_256-NEXT: mov x12, #16
; VBITS_GE_256-NEXT: mov x13, #28
; VBITS_GE_256-NEXT: mov x14, #24
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1h { z0.d }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.d }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z2.d }, p0/z, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.d }, p0/z, [x0, x11, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z4.d }, p0/z, [x0, x12, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z5.d }, p0/z, [x0, x13, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z6.d }, p0/z, [x0, x14, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z7.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: fcvt z0.d, p0/m, z0.h
; VBITS_GE_256-NEXT: fcvt z1.d, p0/m, z1.h
; VBITS_GE_256-NEXT: fcvt z2.d, p0/m, z2.h
; VBITS_GE_256-NEXT: fcvt z3.d, p0/m, z3.h
; VBITS_GE_256-NEXT: fcvt z4.d, p0/m, z4.h
; VBITS_GE_256-NEXT: fcvt z5.d, p0/m, z5.h
; VBITS_GE_256-NEXT: fcvt z6.d, p0/m, z6.h
; VBITS_GE_256-NEXT: fcvt z7.d, p0/m, z7.h
; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x1, x14, lsl #3]
; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x1, x13, lsl #3]
; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x1, x12, lsl #3]
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x11, lsl #3]
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: fcvt_v32f16_v32f64:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1h { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: fcvt z0.d, p0/m, z0.h
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
define void @fcvt_v32f16_v32f64(<32 x half>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvt_v32f16_v32f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
; CHECK-NEXT: fcvt z0.d, p0/m, z0.h
; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; CHECK-NEXT: ret
%op1 = load <32 x half>, <32 x half>* %a
%res = fpext <32 x half> %op1 to <32 x double>
store <32 x double> %res, <32 x double>* %b
@ -342,7 +213,7 @@ define void @fcvt_v32f16_v32f64(<32 x half>* %a, <32 x double>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
define void @fcvt_v1f32_v1f64(<1 x float>* %a, <1 x double>* %b) #0 {
define void @fcvt_v1f32_v1f64(<1 x float>* %a, <1 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v1f32_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0]
@ -356,7 +227,7 @@ define void @fcvt_v1f32_v1f64(<1 x float>* %a, <1 x double>* %b) #0 {
}
; Don't use SVE for 128-bit vectors.
define void @fcvt_v2f32_v2f64(<2 x float>* %a, <2 x double>* %b) #0 {
define void @fcvt_v2f32_v2f64(<2 x float>* %a, <2 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v2f32_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
@ -369,7 +240,7 @@ define void @fcvt_v2f32_v2f64(<2 x float>* %a, <2 x double>* %b) #0 {
ret void
}
define void @fcvt_v4f32_v4f64(<4 x float>* %a, <4 x double>* %b) #0 {
define void @fcvt_v4f32_v4f64(<4 x float>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v4f32_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@ -384,7 +255,6 @@ define void @fcvt_v4f32_v4f64(<4 x float>* %a, <4 x double>* %b) #0 {
}
define void @fcvt_v8f32_v8f64(<8 x float>* %a, <8 x double>* %b) #0 {
; Ensure sensible type legalisation.
; VBITS_GE_256-LABEL: fcvt_v8f32_v8f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
@ -410,84 +280,28 @@ define void @fcvt_v8f32_v8f64(<8 x float>* %a, <8 x double>* %b) #0 {
ret void
}
define void @fcvt_v16f32_v16f64(<16 x float>* %a, <16 x double>* %b) #0 {
; VBITS_GE_256-LABEL: fcvt_v16f32_v16f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: mov x9, #12
; VBITS_GE_256-NEXT: mov x10, #8
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1w { z0.d }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.d }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.d }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: fcvt z0.d, p0/m, z0.s
; VBITS_GE_256-NEXT: fcvt z1.d, p0/m, z1.s
; VBITS_GE_256-NEXT: fcvt z2.d, p0/m, z2.s
; VBITS_GE_256-NEXT: fcvt z3.d, p0/m, z3.s
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: fcvt_v16f32_v16f64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1w { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: fcvt z0.d, p0/m, z0.s
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @fcvt_v16f32_v16f64(<16 x float>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvt_v16f32_v16f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
; CHECK-NEXT: fcvt z0.d, p0/m, z0.s
; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; CHECK-NEXT: ret
%op1 = load <16 x float>, <16 x float>* %a
%res = fpext <16 x float> %op1 to <16 x double>
store <16 x double> %res, <16 x double>* %b
ret void
}
define void @fcvt_v32f32_v32f64(<32 x float>* %a, <32 x double>* %b) #0 {
; VBITS_GE_256-LABEL: fcvt_v32f32_v32f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: mov x9, #12
; VBITS_GE_256-NEXT: mov x10, #8
; VBITS_GE_256-NEXT: mov x11, #20
; VBITS_GE_256-NEXT: mov x12, #16
; VBITS_GE_256-NEXT: mov x13, #28
; VBITS_GE_256-NEXT: mov x14, #24
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1w { z0.d }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.d }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.d }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.d }, p0/z, [x0, x11, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z4.d }, p0/z, [x0, x12, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z5.d }, p0/z, [x0, x13, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z6.d }, p0/z, [x0, x14, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z7.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: fcvt z0.d, p0/m, z0.s
; VBITS_GE_256-NEXT: fcvt z1.d, p0/m, z1.s
; VBITS_GE_256-NEXT: fcvt z2.d, p0/m, z2.s
; VBITS_GE_256-NEXT: fcvt z3.d, p0/m, z3.s
; VBITS_GE_256-NEXT: fcvt z4.d, p0/m, z4.s
; VBITS_GE_256-NEXT: fcvt z5.d, p0/m, z5.s
; VBITS_GE_256-NEXT: fcvt z6.d, p0/m, z6.s
; VBITS_GE_256-NEXT: fcvt z7.d, p0/m, z7.s
; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x1, x14, lsl #3]
; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x1, x13, lsl #3]
; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x1, x12, lsl #3]
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x11, lsl #3]
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: fcvt_v32f32_v32f64:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1w { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: fcvt z0.d, p0/m, z0.s
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
define void @fcvt_v32f32_v32f64(<32 x float>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvt_v32f32_v32f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
; CHECK-NEXT: fcvt z0.d, p0/m, z0.s
; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; CHECK-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
%res = fpext <32 x float> %op1 to <32 x double>
store <32 x double> %res, <32 x double>* %b
@ -499,7 +313,7 @@ define void @fcvt_v32f32_v32f64(<32 x float>* %a, <32 x double>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
define void @fcvt_v2f32_v2f16(<2 x float>* %a, <2 x half>* %b) #0 {
define void @fcvt_v2f32_v2f16(<2 x float>* %a, <2 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v2f32_v2f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
@ -513,7 +327,7 @@ define void @fcvt_v2f32_v2f16(<2 x float>* %a, <2 x half>* %b) #0 {
}
; Don't use SVE for 128-bit vectors.
define void @fcvt_v4f32_v4f16(<4 x float>* %a, <4 x half>* %b) #0 {
define void @fcvt_v4f32_v4f16(<4 x float>* %a, <4 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v4f32_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
@ -526,7 +340,7 @@ define void @fcvt_v4f32_v4f16(<4 x float>* %a, <4 x half>* %b) #0 {
ret void
}
define void @fcvt_v8f32_v8f16(<8 x float>* %a, <8 x half>* %b) #0 {
define void @fcvt_v8f32_v8f16(<8 x float>* %a, <8 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v8f32_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@ -541,7 +355,18 @@ define void @fcvt_v8f32_v8f16(<8 x float>* %a, <8 x half>* %b) #0 {
}
define void @fcvt_v16f32_v16f16(<16 x float>* %a, <16 x half>* %b) #0 {
; Ensure sensible type legalisation
; VBITS_GE_256-LABEL: fcvt_v16f32_v16f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z0.s
; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z1.s
; VBITS_GE_256-NEXT: st1h { z0.s }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.s }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: fcvt_v16f32_v16f16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
@ -555,90 +380,28 @@ define void @fcvt_v16f32_v16f16(<16 x float>* %a, <16 x half>* %b) #0 {
ret void
}
define void @fcvt_v32f32_v32f16(<32 x float>* %a, <32 x half>* %b) #0 {
; VBITS_GE_256-LABEL: fcvt_v32f32_v32f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #24
; VBITS_GE_256-NEXT: mov x9, #16
; VBITS_GE_256-NEXT: mov x10, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z1.s
; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z0.s
; VBITS_GE_256-NEXT: fcvt z2.h, p0/m, z2.s
; VBITS_GE_256-NEXT: fcvt z3.h, p0/m, z3.s
; VBITS_GE_256-NEXT: st1h { z1.s }, p0, [x1, x9, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.s }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z2.s }, p0, [x1, x10, lsl #1]
; VBITS_GE_256-NEXT: st1h { z3.s }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: fcvt_v32f32_v32f16:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: fcvt z0.h, p0/m, z0.s
; VBITS_GE_1024-NEXT: st1h { z0.s }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @fcvt_v32f32_v32f16(<32 x float>* %a, <32 x half>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvt_v32f32_v32f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: fcvt z0.h, p0/m, z0.s
; CHECK-NEXT: st1h { z0.s }, p0, [x1]
; CHECK-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
%res = fptrunc <32 x float> %op1 to <32 x half>
store <32 x half> %res, <32 x half>* %b
ret void
}
define void @fcvt_v64f32_v64f16(<64 x float>* %a, <64 x half>* %b) #0 {
; VBITS_GE_256-LABEL: fcvt_v64f32_v64f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x9, #56
; VBITS_GE_256-NEXT: mov x10, #48
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: mov x11, #24
; VBITS_GE_256-NEXT: mov x12, #16
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: mov x13, #40
; VBITS_GE_256-NEXT: mov x14, #32
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x12, lsl #2]
; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z1.s
; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z0.s
; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x13, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x14, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1h { z1.s }, p0, [x1, x10, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.s }, p0, [x1, x9, lsl #1]
; VBITS_GE_256-NEXT: movprfx z0, z5
; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z5.s
; VBITS_GE_256-NEXT: movprfx z1, z4
; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z4.s
; VBITS_GE_256-NEXT: st1h { z0.s }, p0, [x1, x14, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.s }, p0, [x1, x13, lsl #1]
; VBITS_GE_256-NEXT: movprfx z0, z6
; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z6.s
; VBITS_GE_256-NEXT: movprfx z1, z3
; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z3.s
; VBITS_GE_256-NEXT: st1h { z0.s }, p0, [x1, x12, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.s }, p0, [x1, x11, lsl #1]
; VBITS_GE_256-NEXT: movprfx z0, z2
; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z2.s
; VBITS_GE_256-NEXT: movprfx z1, z7
; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z7.s
; VBITS_GE_256-NEXT: st1h { z0.s }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.s }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: fcvt_v64f32_v64f16:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: fcvt z0.h, p0/m, z0.s
; VBITS_GE_2048-NEXT: st1h { z0.s }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
define void @fcvt_v64f32_v64f16(<64 x float>* %a, <64 x half>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvt_v64f32_v64f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: fcvt z0.h, p0/m, z0.s
; CHECK-NEXT: st1h { z0.s }, p0, [x1]
; CHECK-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a
%res = fptrunc <64 x float> %op1 to <64 x half>
store <64 x half> %res, <64 x half>* %b
@ -650,7 +413,7 @@ define void @fcvt_v64f32_v64f16(<64 x float>* %a, <64 x half>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
define void @fcvt_v1f64_v1f16(<1 x double>* %a, <1 x half>* %b) #0 {
define void @fcvt_v1f64_v1f16(<1 x double>* %a, <1 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v1f64_v1f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
@ -664,7 +427,7 @@ define void @fcvt_v1f64_v1f16(<1 x double>* %a, <1 x half>* %b) #0 {
}
; v2f16 is not legal for NEON, so use SVE
define void @fcvt_v2f64_v2f16(<2 x double>* %a, <2 x half>* %b) #0 {
define void @fcvt_v2f64_v2f16(<2 x double>* %a, <2 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v2f64_v2f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
@ -680,7 +443,7 @@ define void @fcvt_v2f64_v2f16(<2 x double>* %a, <2 x half>* %b) #0 {
ret void
}
define void @fcvt_v4f64_v4f16(<4 x double>* %a, <4 x half>* %b) #0 {
define void @fcvt_v4f64_v4f16(<4 x double>* %a, <4 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v4f64_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@ -695,7 +458,6 @@ define void @fcvt_v4f64_v4f16(<4 x double>* %a, <4 x half>* %b) #0 {
}
define void @fcvt_v8f64_v8f16(<8 x double>* %a, <8 x half>* %b) #0 {
; Ensure sensible type legalisation
; VBITS_GE_256-LABEL: fcvt_v8f64_v8f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
@ -726,70 +488,28 @@ define void @fcvt_v8f64_v8f16(<8 x double>* %a, <8 x half>* %b) #0 {
ret void
}
define void @fcvt_v16f64_v16f16(<16 x double>* %a, <16 x half>* %b) #0 {
; VBITS_GE_1024-LABEL: fcvt_v16f64_v16f16:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: fcvt z0.h, p0/m, z0.d
; VBITS_GE_1024-NEXT: st1h { z0.d }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @fcvt_v16f64_v16f16(<16 x double>* %a, <16 x half>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvt_v16f64_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: fcvt z0.h, p0/m, z0.d
; CHECK-NEXT: st1h { z0.d }, p0, [x1]
; CHECK-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%res = fptrunc <16 x double> %op1 to <16 x half>
store <16 x half> %res, <16 x half>* %b
ret void
}
define void @fcvt_v32f64_v32f16(<32 x double>* %a, <32 x half>* %b) #0 {
; VBITS_GE_256-LABEL: fcvt_v32f64_v32f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x9, #28
; VBITS_GE_256-NEXT: mov x10, #24
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: mov x11, #12
; VBITS_GE_256-NEXT: mov x12, #8
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: mov x13, #20
; VBITS_GE_256-NEXT: mov x14, #16
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x12, lsl #3]
; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z1.d
; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z0.d
; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x13, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x14, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1h { z1.d }, p0, [x1, x10, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.d }, p0, [x1, x9, lsl #1]
; VBITS_GE_256-NEXT: movprfx z0, z5
; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z5.d
; VBITS_GE_256-NEXT: movprfx z1, z4
; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z4.d
; VBITS_GE_256-NEXT: st1h { z0.d }, p0, [x1, x14, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.d }, p0, [x1, x13, lsl #1]
; VBITS_GE_256-NEXT: movprfx z0, z6
; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z6.d
; VBITS_GE_256-NEXT: movprfx z1, z3
; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z3.d
; VBITS_GE_256-NEXT: st1h { z0.d }, p0, [x1, x12, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.d }, p0, [x1, x11, lsl #1]
; VBITS_GE_256-NEXT: movprfx z0, z2
; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z2.d
; VBITS_GE_256-NEXT: movprfx z1, z7
; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z7.d
; VBITS_GE_256-NEXT: st1h { z0.d }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: fcvt_v32f64_v32f16:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: fcvt z0.h, p0/m, z0.d
; VBITS_GE_2048-NEXT: st1h { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
define void @fcvt_v32f64_v32f16(<32 x double>* %a, <32 x half>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvt_v32f64_v32f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: fcvt z0.h, p0/m, z0.d
; CHECK-NEXT: st1h { z0.d }, p0, [x1]
; CHECK-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%res = fptrunc <32 x double> %op1 to <32 x half>
store <32 x half> %res, <32 x half>* %b
@ -801,7 +521,7 @@ define void @fcvt_v32f64_v32f16(<32 x double>* %a, <32 x half>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
define void @fcvt_v1f64_v1f32(<1 x double> %op1, <1 x float>* %b) #0 {
define void @fcvt_v1f64_v1f32(<1 x double> %op1, <1 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v1f64_v1f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
@ -814,7 +534,7 @@ define void @fcvt_v1f64_v1f32(<1 x double> %op1, <1 x float>* %b) #0 {
}
; Don't use SVE for 128-bit vectors.
define void @fcvt_v2f64_v2f32(<2 x double> %op1, <2 x float>* %b) #0 {
define void @fcvt_v2f64_v2f32(<2 x double> %op1, <2 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v2f64_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fcvtn v0.2s, v0.2d
@ -825,7 +545,7 @@ define void @fcvt_v2f64_v2f32(<2 x double> %op1, <2 x float>* %b) #0 {
ret void
}
define void @fcvt_v4f64_v4f32(<4 x double>* %a, <4 x float>* %b) #0 {
define void @fcvt_v4f64_v4f32(<4 x double>* %a, <4 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v4f64_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@ -840,7 +560,18 @@ define void @fcvt_v4f64_v4f32(<4 x double>* %a, <4 x float>* %b) #0 {
}
define void @fcvt_v8f64_v8f32(<8 x double>* %a, <8 x float>* %b) #0 {
; Ensure sensible type legalisation
; VBITS_GE_256-LABEL: fcvt_v8f64_v8f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: fcvt z0.s, p0/m, z0.d
; VBITS_GE_256-NEXT: fcvt z1.s, p0/m, z1.d
; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: fcvt_v8f64_v8f32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
@ -854,90 +585,28 @@ define void @fcvt_v8f64_v8f32(<8 x double>* %a, <8 x float>* %b) #0 {
ret void
}
define void @fcvt_v16f64_v16f32(<16 x double>* %a, <16 x float>* %b) #0 {
; VBITS_GE_256-LABEL: fcvt_v16f64_v16f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #12
; VBITS_GE_256-NEXT: mov x9, #8
; VBITS_GE_256-NEXT: mov x10, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: fcvt z1.s, p0/m, z1.d
; VBITS_GE_256-NEXT: fcvt z0.s, p0/m, z0.d
; VBITS_GE_256-NEXT: fcvt z2.s, p0/m, z2.d
; VBITS_GE_256-NEXT: fcvt z3.s, p0/m, z3.d
; VBITS_GE_256-NEXT: st1w { z1.d }, p0, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z2.d }, p0, [x1, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z3.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: fcvt_v16f64_v16f32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: fcvt z0.s, p0/m, z0.d
; VBITS_GE_1024-NEXT: st1w { z0.d }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @fcvt_v16f64_v16f32(<16 x double>* %a, <16 x float>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvt_v16f64_v16f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: fcvt z0.s, p0/m, z0.d
; CHECK-NEXT: st1w { z0.d }, p0, [x1]
; CHECK-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%res = fptrunc <16 x double> %op1 to <16 x float>
store <16 x float> %res, <16 x float>* %b
ret void
}
define void @fcvt_v32f64_v32f32(<32 x double>* %a, <32 x float>* %b) #0 {
; VBITS_GE_256-LABEL: fcvt_v32f64_v32f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x9, #28
; VBITS_GE_256-NEXT: mov x10, #24
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: mov x11, #12
; VBITS_GE_256-NEXT: mov x12, #8
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: mov x13, #20
; VBITS_GE_256-NEXT: mov x14, #16
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x12, lsl #3]
; VBITS_GE_256-NEXT: fcvt z1.s, p0/m, z1.d
; VBITS_GE_256-NEXT: fcvt z0.s, p0/m, z0.d
; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x13, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x14, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1w { z1.d }, p0, [x1, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: movprfx z0, z5
; VBITS_GE_256-NEXT: fcvt z0.s, p0/m, z5.d
; VBITS_GE_256-NEXT: movprfx z1, z4
; VBITS_GE_256-NEXT: fcvt z1.s, p0/m, z4.d
; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [x1, x14, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.d }, p0, [x1, x13, lsl #2]
; VBITS_GE_256-NEXT: movprfx z0, z6
; VBITS_GE_256-NEXT: fcvt z0.s, p0/m, z6.d
; VBITS_GE_256-NEXT: movprfx z1, z3
; VBITS_GE_256-NEXT: fcvt z1.s, p0/m, z3.d
; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [x1, x12, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.d }, p0, [x1, x11, lsl #2]
; VBITS_GE_256-NEXT: movprfx z0, z2
; VBITS_GE_256-NEXT: fcvt z0.s, p0/m, z2.d
; VBITS_GE_256-NEXT: movprfx z1, z7
; VBITS_GE_256-NEXT: fcvt z1.s, p0/m, z7.d
; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: fcvt_v32f64_v32f32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: fcvt z0.s, p0/m, z0.d
; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
define void @fcvt_v32f64_v32f32(<32 x double>* %a, <32 x float>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvt_v32f64_v32f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: fcvt z0.s, p0/m, z0.d
; CHECK-NEXT: st1w { z0.d }, p0, [x1]
; CHECK-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%res = fptrunc <32 x double> %op1 to <32 x float>
store <32 x float> %res, <32 x float>* %b

@ -1,5 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s
; RUN: llc -O3 -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -O3 -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -O3 -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
@ -8,7 +10,7 @@ target triple = "aarch64-unknown-linux-gnu"
;
; Don't use SVE for 64-bit vectors.
define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3) #0 {
define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: fmla v2.4h, v0.4h, v1.4h
@ -20,7 +22,7 @@ define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3)
}
; Don't use SVE for 128-bit vectors.
define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) #0 {
define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: fmla v2.8h, v0.8h, v1.8h
@ -31,7 +33,7 @@ define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3)
ret <8 x half> %res
}
define void @fma_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x half>* %c) #0 {
define void @fma_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x half>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@ -51,15 +53,31 @@ define void @fma_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x half>* %c) #0 {
}
define void @fma_v32f16(<32 x half>* %a, <32 x half>* %b, <32 x half>* %c) #0 {
; CHECK-LABEL: fma_v32f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl32
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: ld1h { z2.h }, p0/z, [x2]
; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z2.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
; VBITS_GE_256-LABEL: fma_v32f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x2, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x2]
; VBITS_GE_256-NEXT: fmad z0.h, p0/m, z2.h, z4.h
; VBITS_GE_256-NEXT: fmad z1.h, p0/m, z3.h, z5.h
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: fma_v32f16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT: ld1h { z2.h }, p0/z, [x2]
; VBITS_GE_512-NEXT: fmad z0.h, p0/m, z1.h, z2.h
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <32 x half>, <32 x half>* %a
%op2 = load <32 x half>, <32 x half>* %b
%op3 = load <32 x half>, <32 x half>* %c
@ -69,7 +87,7 @@ define void @fma_v32f16(<32 x half>* %a, <32 x half>* %b, <32 x half>* %c) #0 {
ret void
}
define void @fma_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x half>* %c) #0 {
define void @fma_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x half>* %c) vscale_range(8,0) #0 {
; CHECK-LABEL: fma_v64f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
@ -88,7 +106,7 @@ define void @fma_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x half>* %c) #0 {
ret void
}
define void @fma_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x half>* %c) #0 {
define void @fma_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x half>* %c) vscale_range(16,0) #0 {
; CHECK-LABEL: fma_v128f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
@ -108,7 +126,7 @@ define void @fma_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x half>* %c) #
}
; Don't use SVE for 64-bit vectors.
define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3) #0 {
define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fmla v2.2s, v0.2s, v1.2s
@ -120,7 +138,7 @@ define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %o
}
; Don't use SVE for 128-bit vectors.
define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3) #0 {
define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fmla v2.4s, v0.4s, v1.4s
@ -131,7 +149,7 @@ define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %o
ret <4 x float> %res
}
define void @fma_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) #0 {
define void @fma_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@ -151,15 +169,31 @@ define void @fma_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) #0 {
}
define void @fma_v16f32(<16 x float>* %a, <16 x float>* %b, <16 x float>* %c) #0 {
; CHECK-LABEL: fma_v16f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl16
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: ld1w { z2.s }, p0/z, [x2]
; CHECK-NEXT: fmad z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
; VBITS_GE_256-LABEL: fma_v16f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x2]
; VBITS_GE_256-NEXT: fmad z0.s, p0/m, z2.s, z4.s
; VBITS_GE_256-NEXT: fmad z1.s, p0/m, z3.s, z5.s
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: fma_v16f32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: ld1w { z2.s }, p0/z, [x2]
; VBITS_GE_512-NEXT: fmad z0.s, p0/m, z1.s, z2.s
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <16 x float>, <16 x float>* %a
%op2 = load <16 x float>, <16 x float>* %b
%op3 = load <16 x float>, <16 x float>* %c
@ -169,7 +203,7 @@ define void @fma_v16f32(<16 x float>* %a, <16 x float>* %b, <16 x float>* %c) #0
ret void
}
define void @fma_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x float>* %c) #0 {
define void @fma_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x float>* %c) vscale_range(8,0) #0 {
; CHECK-LABEL: fma_v32f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
@ -188,7 +222,7 @@ define void @fma_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x float>* %c) #0
ret void
}
define void @fma_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x float>* %c) #0 {
define void @fma_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x float>* %c) vscale_range(16,0) #0 {
; CHECK-LABEL: fma_v64f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
@ -208,7 +242,7 @@ define void @fma_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x float>* %c) #0
}
; Don't use SVE for 64-bit vectors.
define <1 x double> @fma_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double> %op3) #0 {
define <1 x double> @fma_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: fmadd d0, d0, d1, d2
@ -219,7 +253,7 @@ define <1 x double> @fma_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double
}
; Don't use SVE for 128-bit vectors.
define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3) #0 {
define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: fmla v2.2d, v0.2d, v1.2d
@ -230,7 +264,7 @@ define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double
ret <2 x double> %res
}
define void @fma_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x double>* %c) #0 {
define void @fma_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x double>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@ -250,15 +284,31 @@ define void @fma_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x double>* %c) #0
}
define void @fma_v8f64(<8 x double>* %a, <8 x double>* %b, <8 x double>* %c) #0 {
; CHECK-LABEL: fma_v8f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl8
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: ld1d { z2.d }, p0/z, [x2]
; CHECK-NEXT: fmad z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
; VBITS_GE_256-LABEL: fma_v8f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x2]
; VBITS_GE_256-NEXT: fmad z0.d, p0/m, z2.d, z4.d
; VBITS_GE_256-NEXT: fmad z1.d, p0/m, z3.d, z5.d
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: fma_v8f64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: ld1d { z2.d }, p0/z, [x2]
; VBITS_GE_512-NEXT: fmad z0.d, p0/m, z1.d, z2.d
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <8 x double>, <8 x double>* %a
%op2 = load <8 x double>, <8 x double>* %b
%op3 = load <8 x double>, <8 x double>* %c
@ -268,7 +318,7 @@ define void @fma_v8f64(<8 x double>* %a, <8 x double>* %b, <8 x double>* %c) #0
ret void
}
define void @fma_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x double>* %c) #0 {
define void @fma_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x double>* %c) vscale_range(8,0) #0 {
; CHECK-LABEL: fma_v16f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
@ -287,7 +337,7 @@ define void @fma_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x double>* %c)
ret void
}
define void @fma_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x double>* %c) #0 {
define void @fma_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x double>* %c) vscale_range(16,0) #0 {
; CHECK-LABEL: fma_v32f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -1,36 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: ptrue
; Don't use SVE for 64-bit vectors.
define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, i1 %mask) #0 {
; NO_SVE-LABEL: select_v4f16:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: tst w0, #0x1
; NO_SVE-NEXT: csetm w8, ne
; NO_SVE-NEXT: dup v2.4h, w8
; NO_SVE-NEXT: bif v0.8b, v1.8b, v2.8b
; NO_SVE-NEXT: ret
;
define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
@@ -43,15 +19,7 @@ define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, i1 %mask) #0 {
}
; Don't use SVE for 128-bit vectors.
define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) #0 {
; NO_SVE-LABEL: select_v8f16:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: tst w0, #0x1
; NO_SVE-NEXT: csetm w8, ne
; NO_SVE-NEXT: dup v2.8h, w8
; NO_SVE-NEXT: bif v0.16b, v1.16b, v2.16b
; NO_SVE-NEXT: ret
;
define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
@@ -63,21 +31,7 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) #0 {
ret <8 x half> %sel
}
define void @select_v16f16(<16 x half>* %a, <16 x half>* %b, i1 %mask) #0 {
; NO_SVE-LABEL: select_v16f16:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: tst w2, #0x1
; NO_SVE-NEXT: ldr q0, [x0]
; NO_SVE-NEXT: csetm w8, ne
; NO_SVE-NEXT: ldr q1, [x0, #16]
; NO_SVE-NEXT: ldr q2, [x1]
; NO_SVE-NEXT: ldr q3, [x1, #16]
; NO_SVE-NEXT: dup v4.8h, w8
; NO_SVE-NEXT: bif v0.16b, v2.16b, v4.16b
; NO_SVE-NEXT: bif v1.16b, v3.16b, v4.16b
; NO_SVE-NEXT: stp q0, q1, [x0]
; NO_SVE-NEXT: ret
;
define void @select_v16f16(<16 x half>* %a, <16 x half>* %b, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w2, #0x1
@@ -99,26 +53,24 @@ define void @select_v16f16(<16 x half>* %a, <16 x half>* %b, i1 %mask) #0 {
}
define void @select_v32f16(<32 x half>* %a, <32 x half>* %b, i1 %mask) #0 {
; NO_SVE-LABEL: select_v32f16:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: tst w2, #0x1
; NO_SVE-NEXT: ldr q0, [x0, #48]
; NO_SVE-NEXT: csetm w8, ne
; NO_SVE-NEXT: ldr q1, [x0]
; NO_SVE-NEXT: ldr q2, [x0, #16]
; NO_SVE-NEXT: ldr q3, [x0, #32]
; NO_SVE-NEXT: ldr q4, [x1, #48]
; NO_SVE-NEXT: dup v6.8h, w8
; NO_SVE-NEXT: ldr q5, [x1]
; NO_SVE-NEXT: ldr q7, [x1, #16]
; NO_SVE-NEXT: ldr q16, [x1, #32]
; NO_SVE-NEXT: bif v1.16b, v5.16b, v6.16b
; NO_SVE-NEXT: bif v2.16b, v7.16b, v6.16b
; NO_SVE-NEXT: bif v0.16b, v4.16b, v6.16b
; NO_SVE-NEXT: bif v3.16b, v16.16b, v6.16b
; NO_SVE-NEXT: stp q1, q2, [x0]
; NO_SVE-NEXT: stp q3, q0, [x0, #32]
; NO_SVE-NEXT: ret
; VBITS_GE_256-LABEL: select_v32f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: and w9, w2, #0x1
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ptrue p1.h
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: mov z4.h, w9
; VBITS_GE_256-NEXT: and z4.h, z4.h, #0x1
; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z4.h, #0
; VBITS_GE_256-NEXT: sel z1.h, p1, z1.h, z3.h
; VBITS_GE_256-NEXT: sel z0.h, p1, z0.h, z2.h
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: select_v32f16:
; VBITS_GE_512: // %bb.0:
@@ -140,58 +92,20 @@ define void @select_v32f16(<32 x half>* %a, <32 x half>* %b, i1 %mask) #0 {
ret void
}
define void @select_v64f16(<64 x half>* %a, <64 x half>* %b, i1 %mask) #0 {
; NO_SVE-LABEL: select_v64f16:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: tst w2, #0x1
; NO_SVE-NEXT: ldr q0, [x0, #16]
; NO_SVE-NEXT: csetm w8, ne
; NO_SVE-NEXT: ldr q1, [x0]
; NO_SVE-NEXT: ldr q2, [x0, #48]
; NO_SVE-NEXT: ldr q3, [x0, #32]
; NO_SVE-NEXT: ldr q4, [x0, #80]
; NO_SVE-NEXT: dup v21.8h, w8
; NO_SVE-NEXT: ldr q5, [x0, #64]
; NO_SVE-NEXT: ldr q6, [x0, #112]
; NO_SVE-NEXT: ldr q7, [x0, #96]
; NO_SVE-NEXT: ldr q16, [x1, #16]
; NO_SVE-NEXT: ldr q17, [x1]
; NO_SVE-NEXT: ldr q18, [x1, #48]
; NO_SVE-NEXT: ldr q19, [x1, #32]
; NO_SVE-NEXT: bif v0.16b, v16.16b, v21.16b
; NO_SVE-NEXT: ldr q20, [x1, #80]
; NO_SVE-NEXT: bif v1.16b, v17.16b, v21.16b
; NO_SVE-NEXT: ldr q16, [x1, #64]
; NO_SVE-NEXT: bif v2.16b, v18.16b, v21.16b
; NO_SVE-NEXT: ldr q17, [x1, #112]
; NO_SVE-NEXT: bif v3.16b, v19.16b, v21.16b
; NO_SVE-NEXT: ldr q18, [x1, #96]
; NO_SVE-NEXT: bif v4.16b, v20.16b, v21.16b
; NO_SVE-NEXT: stp q1, q0, [x0]
; NO_SVE-NEXT: mov v0.16b, v21.16b
; NO_SVE-NEXT: mov v1.16b, v21.16b
; NO_SVE-NEXT: stp q3, q2, [x0, #32]
; NO_SVE-NEXT: mov v2.16b, v21.16b
; NO_SVE-NEXT: bsl v0.16b, v5.16b, v16.16b
; NO_SVE-NEXT: bsl v1.16b, v6.16b, v17.16b
; NO_SVE-NEXT: bsl v2.16b, v7.16b, v18.16b
; NO_SVE-NEXT: stp q0, q4, [x0, #64]
; NO_SVE-NEXT: stp q2, q1, [x0, #96]
; NO_SVE-NEXT: ret
;
; VBITS_GE_1024-LABEL: select_v64f16:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: and w8, w2, #0x1
; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_1024-NEXT: ptrue p1.h
; VBITS_GE_1024-NEXT: mov z2.h, w8
; VBITS_GE_1024-NEXT: and z2.h, z2.h, #0x1
; VBITS_GE_1024-NEXT: cmpne p1.h, p1/z, z2.h, #0
; VBITS_GE_1024-NEXT: sel z0.h, p1, z0.h, z1.h
; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
define void @select_v64f16(<64 x half>* %a, <64 x half>* %b, i1 %mask) vscale_range(8,0) #0 {
; CHECK-LABEL: select_v64f16:
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w2, #0x1
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: ptrue p1.h
; CHECK-NEXT: mov z2.h, w8
; CHECK-NEXT: and z2.h, z2.h, #0x1
; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0
; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load volatile <64 x half>, <64 x half>* %a
%op2 = load volatile <64 x half>, <64 x half>* %b
%sel = select i1 %mask, <64 x half> %op1, <64 x half> %op2
@@ -199,103 +113,20 @@ define void @select_v64f16(<64 x half>* %a, <64 x half>* %b, i1 %mask) #0 {
ret void
}
define void @select_v128f16(<128 x half>* %a, <128 x half>* %b, i1 %mask) #0 {
; NO_SVE-LABEL: select_v128f16:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: stp d11, d10, [sp, #-32]! // 16-byte Folded Spill
; NO_SVE-NEXT: .cfi_def_cfa_offset 32
; NO_SVE-NEXT: stp d9, d8, [sp, #16] // 16-byte Folded Spill
; NO_SVE-NEXT: .cfi_offset b8, -8
; NO_SVE-NEXT: .cfi_offset b9, -16
; NO_SVE-NEXT: .cfi_offset b10, -24
; NO_SVE-NEXT: .cfi_offset b11, -32
; NO_SVE-NEXT: tst w2, #0x1
; NO_SVE-NEXT: ldr q0, [x0, #240]
; NO_SVE-NEXT: csetm w8, ne
; NO_SVE-NEXT: ldr q1, [x0, #224]
; NO_SVE-NEXT: ldr q2, [x0, #208]
; NO_SVE-NEXT: ldr q3, [x0, #192]
; NO_SVE-NEXT: ldr q4, [x0, #176]
; NO_SVE-NEXT: dup v8.8h, w8
; NO_SVE-NEXT: ldr q5, [x0, #160]
; NO_SVE-NEXT: ldr q6, [x0, #144]
; NO_SVE-NEXT: ldr q7, [x0, #128]
; NO_SVE-NEXT: ldr q16, [x0, #112]
; NO_SVE-NEXT: ldr q17, [x0, #96]
; NO_SVE-NEXT: ldr q18, [x0, #80]
; NO_SVE-NEXT: ldr q19, [x0, #64]
; NO_SVE-NEXT: ldr q20, [x0, #48]
; NO_SVE-NEXT: ldr q21, [x0, #32]
; NO_SVE-NEXT: ldr q22, [x0, #16]
; NO_SVE-NEXT: ldr q23, [x0]
; NO_SVE-NEXT: ldr q24, [x1, #240]
; NO_SVE-NEXT: ldr q25, [x1, #224]
; NO_SVE-NEXT: ldr q26, [x1, #208]
; NO_SVE-NEXT: ldr q27, [x1, #192]
; NO_SVE-NEXT: bif v0.16b, v24.16b, v8.16b
; NO_SVE-NEXT: ldr q28, [x1, #176]
; NO_SVE-NEXT: bif v1.16b, v25.16b, v8.16b
; NO_SVE-NEXT: ldr q29, [x1, #160]
; NO_SVE-NEXT: bif v2.16b, v26.16b, v8.16b
; NO_SVE-NEXT: ldr q30, [x1, #144]
; NO_SVE-NEXT: bif v3.16b, v27.16b, v8.16b
; NO_SVE-NEXT: ldr q31, [x1, #128]
; NO_SVE-NEXT: ldr q9, [x1, #112]
; NO_SVE-NEXT: ldr q10, [x1, #96]
; NO_SVE-NEXT: bif v4.16b, v28.16b, v8.16b
; NO_SVE-NEXT: ldr q28, [x1, #80]
; NO_SVE-NEXT: ldr q24, [x1, #64]
; NO_SVE-NEXT: ldr q25, [x1, #48]
; NO_SVE-NEXT: ldr q26, [x1, #32]
; NO_SVE-NEXT: ldr q27, [x1, #16]
; NO_SVE-NEXT: ldr q11, [x1]
; NO_SVE-NEXT: stp q3, q2, [x0, #192]
; NO_SVE-NEXT: stp q1, q0, [x0, #224]
; NO_SVE-NEXT: mov v0.16b, v8.16b
; NO_SVE-NEXT: mov v1.16b, v8.16b
; NO_SVE-NEXT: mov v2.16b, v8.16b
; NO_SVE-NEXT: bsl v0.16b, v5.16b, v29.16b
; NO_SVE-NEXT: bsl v1.16b, v6.16b, v30.16b
; NO_SVE-NEXT: bsl v2.16b, v7.16b, v31.16b
; NO_SVE-NEXT: mov v3.16b, v8.16b
; NO_SVE-NEXT: stp q0, q4, [x0, #160]
; NO_SVE-NEXT: mov v4.16b, v8.16b
; NO_SVE-NEXT: mov v0.16b, v8.16b
; NO_SVE-NEXT: stp q2, q1, [x0, #128]
; NO_SVE-NEXT: mov v1.16b, v8.16b
; NO_SVE-NEXT: bsl v3.16b, v16.16b, v9.16b
; NO_SVE-NEXT: bsl v4.16b, v17.16b, v10.16b
; NO_SVE-NEXT: bsl v0.16b, v18.16b, v28.16b
; NO_SVE-NEXT: bsl v1.16b, v19.16b, v24.16b
; NO_SVE-NEXT: mov v2.16b, v8.16b
; NO_SVE-NEXT: stp q4, q3, [x0, #96]
; NO_SVE-NEXT: mov v3.16b, v8.16b
; NO_SVE-NEXT: mov v4.16b, v8.16b
; NO_SVE-NEXT: stp q1, q0, [x0, #64]
; NO_SVE-NEXT: mov v0.16b, v8.16b
; NO_SVE-NEXT: bsl v2.16b, v20.16b, v25.16b
; NO_SVE-NEXT: bsl v3.16b, v21.16b, v26.16b
; NO_SVE-NEXT: bsl v4.16b, v22.16b, v27.16b
; NO_SVE-NEXT: bsl v0.16b, v23.16b, v11.16b
; NO_SVE-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload
; NO_SVE-NEXT: stp q3, q2, [x0, #32]
; NO_SVE-NEXT: stp q0, q4, [x0]
; NO_SVE-NEXT: ldp d11, d10, [sp], #32 // 16-byte Folded Reload
; NO_SVE-NEXT: ret
;
; VBITS_GE_2048-LABEL: select_v128f16:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: and w8, w2, #0x1
; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_2048-NEXT: ptrue p1.h
; VBITS_GE_2048-NEXT: mov z2.h, w8
; VBITS_GE_2048-NEXT: and z2.h, z2.h, #0x1
; VBITS_GE_2048-NEXT: cmpne p1.h, p1/z, z2.h, #0
; VBITS_GE_2048-NEXT: sel z0.h, p1, z0.h, z1.h
; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
define void @select_v128f16(<128 x half>* %a, <128 x half>* %b, i1 %mask) vscale_range(16,0) #0 {
; CHECK-LABEL: select_v128f16:
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w2, #0x1
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: ptrue p1.h
; CHECK-NEXT: mov z2.h, w8
; CHECK-NEXT: and z2.h, z2.h, #0x1
; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0
; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load volatile <128 x half>, <128 x half>* %a
%op2 = load volatile <128 x half>, <128 x half>* %b
%sel = select i1 %mask, <128 x half> %op1, <128 x half> %op2
@@ -304,15 +135,7 @@ define void @select_v128f16(<128 x half>* %a, <128 x half>* %b, i1 %mask) #0 {
}
; Don't use SVE for 64-bit vectors.
define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, i1 %mask) #0 {
; NO_SVE-LABEL: select_v2f32:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: tst w0, #0x1
; NO_SVE-NEXT: csetm w8, ne
; NO_SVE-NEXT: dup v2.2s, w8
; NO_SVE-NEXT: bif v0.8b, v1.8b, v2.8b
; NO_SVE-NEXT: ret
;
define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
@@ -325,15 +148,7 @@ define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, i1 %mask) #
}
; Don't use SVE for 128-bit vectors.
define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) #0 {
; NO_SVE-LABEL: select_v4f32:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: tst w0, #0x1
; NO_SVE-NEXT: csetm w8, ne
; NO_SVE-NEXT: dup v2.4s, w8
; NO_SVE-NEXT: bif v0.16b, v1.16b, v2.16b
; NO_SVE-NEXT: ret
;
define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
@@ -345,21 +160,7 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) #
ret <4 x float> %sel
}
define void @select_v8f32(<8 x float>* %a, <8 x float>* %b, i1 %mask) #0 {
; NO_SVE-LABEL: select_v8f32:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: tst w2, #0x1
; NO_SVE-NEXT: ldr q0, [x0]
; NO_SVE-NEXT: csetm w8, ne
; NO_SVE-NEXT: ldr q1, [x0, #16]
; NO_SVE-NEXT: ldr q2, [x1]
; NO_SVE-NEXT: ldr q3, [x1, #16]
; NO_SVE-NEXT: dup v4.4s, w8
; NO_SVE-NEXT: bif v0.16b, v2.16b, v4.16b
; NO_SVE-NEXT: bif v1.16b, v3.16b, v4.16b
; NO_SVE-NEXT: stp q0, q1, [x0]
; NO_SVE-NEXT: ret
;
define void @select_v8f32(<8 x float>* %a, <8 x float>* %b, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w2, #0x1
@@ -381,26 +182,24 @@ define void @select_v8f32(<8 x float>* %a, <8 x float>* %b, i1 %mask) #0 {
}
define void @select_v16f32(<16 x float>* %a, <16 x float>* %b, i1 %mask) #0 {
; NO_SVE-LABEL: select_v16f32:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: tst w2, #0x1
; NO_SVE-NEXT: ldr q0, [x0, #48]
; NO_SVE-NEXT: csetm w8, ne
; NO_SVE-NEXT: ldr q1, [x0]
; NO_SVE-NEXT: ldr q2, [x0, #16]
; NO_SVE-NEXT: ldr q3, [x0, #32]
; NO_SVE-NEXT: ldr q4, [x1, #48]
; NO_SVE-NEXT: dup v6.4s, w8
; NO_SVE-NEXT: ldr q5, [x1]
; NO_SVE-NEXT: ldr q7, [x1, #16]
; NO_SVE-NEXT: ldr q16, [x1, #32]
; NO_SVE-NEXT: bif v1.16b, v5.16b, v6.16b
; NO_SVE-NEXT: bif v2.16b, v7.16b, v6.16b
; NO_SVE-NEXT: bif v0.16b, v4.16b, v6.16b
; NO_SVE-NEXT: bif v3.16b, v16.16b, v6.16b
; NO_SVE-NEXT: stp q1, q2, [x0]
; NO_SVE-NEXT: stp q3, q0, [x0, #32]
; NO_SVE-NEXT: ret
; VBITS_GE_256-LABEL: select_v16f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: and w9, w2, #0x1
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ptrue p1.s
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: mov z4.s, w9
; VBITS_GE_256-NEXT: and z4.s, z4.s, #0x1
; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z4.s, #0
; VBITS_GE_256-NEXT: sel z1.s, p1, z1.s, z3.s
; VBITS_GE_256-NEXT: sel z0.s, p1, z0.s, z2.s
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: select_v16f32:
; VBITS_GE_512: // %bb.0:
@@ -422,58 +221,20 @@ define void @select_v16f32(<16 x float>* %a, <16 x float>* %b, i1 %mask) #0 {
ret void
}
define void @select_v32f32(<32 x float>* %a, <32 x float>* %b, i1 %mask) #0 {
; NO_SVE-LABEL: select_v32f32:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: tst w2, #0x1
; NO_SVE-NEXT: ldr q0, [x0, #16]
; NO_SVE-NEXT: csetm w8, ne
; NO_SVE-NEXT: ldr q1, [x0]
; NO_SVE-NEXT: ldr q2, [x0, #48]
; NO_SVE-NEXT: ldr q3, [x0, #32]
; NO_SVE-NEXT: ldr q4, [x0, #80]
; NO_SVE-NEXT: dup v21.4s, w8
; NO_SVE-NEXT: ldr q5, [x0, #64]
; NO_SVE-NEXT: ldr q6, [x0, #112]
; NO_SVE-NEXT: ldr q7, [x0, #96]
; NO_SVE-NEXT: ldr q16, [x1, #16]
; NO_SVE-NEXT: ldr q17, [x1]
; NO_SVE-NEXT: ldr q18, [x1, #48]
; NO_SVE-NEXT: ldr q19, [x1, #32]
; NO_SVE-NEXT: bif v0.16b, v16.16b, v21.16b
; NO_SVE-NEXT: ldr q20, [x1, #80]
; NO_SVE-NEXT: bif v1.16b, v17.16b, v21.16b
; NO_SVE-NEXT: ldr q16, [x1, #64]
; NO_SVE-NEXT: bif v2.16b, v18.16b, v21.16b
; NO_SVE-NEXT: ldr q17, [x1, #112]
; NO_SVE-NEXT: bif v3.16b, v19.16b, v21.16b
; NO_SVE-NEXT: ldr q18, [x1, #96]
; NO_SVE-NEXT: bif v4.16b, v20.16b, v21.16b
; NO_SVE-NEXT: stp q1, q0, [x0]
; NO_SVE-NEXT: mov v0.16b, v21.16b
; NO_SVE-NEXT: mov v1.16b, v21.16b
; NO_SVE-NEXT: stp q3, q2, [x0, #32]
; NO_SVE-NEXT: mov v2.16b, v21.16b
; NO_SVE-NEXT: bsl v0.16b, v5.16b, v16.16b
; NO_SVE-NEXT: bsl v1.16b, v6.16b, v17.16b
; NO_SVE-NEXT: bsl v2.16b, v7.16b, v18.16b
; NO_SVE-NEXT: stp q0, q4, [x0, #64]
; NO_SVE-NEXT: stp q2, q1, [x0, #96]
; NO_SVE-NEXT: ret
;
; VBITS_GE_1024-LABEL: select_v32f32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: and w8, w2, #0x1
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_1024-NEXT: ptrue p1.s
; VBITS_GE_1024-NEXT: mov z2.s, w8
; VBITS_GE_1024-NEXT: and z2.s, z2.s, #0x1
; VBITS_GE_1024-NEXT: cmpne p1.s, p1/z, z2.s, #0
; VBITS_GE_1024-NEXT: sel z0.s, p1, z0.s, z1.s
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
define void @select_v32f32(<32 x float>* %a, <32 x float>* %b, i1 %mask) vscale_range(8,0) #0 {
; CHECK-LABEL: select_v32f32:
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w2, #0x1
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: ptrue p1.s
; CHECK-NEXT: mov z2.s, w8
; CHECK-NEXT: and z2.s, z2.s, #0x1
; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0
; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load volatile <32 x float>, <32 x float>* %a
%op2 = load volatile <32 x float>, <32 x float>* %b
%sel = select i1 %mask, <32 x float> %op1, <32 x float> %op2
@@ -481,103 +242,20 @@ define void @select_v32f32(<32 x float>* %a, <32 x float>* %b, i1 %mask) #0 {
ret void
}
define void @select_v64f32(<64 x float>* %a, <64 x float>* %b, i1 %mask) #0 {
; NO_SVE-LABEL: select_v64f32:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: stp d11, d10, [sp, #-32]! // 16-byte Folded Spill
; NO_SVE-NEXT: .cfi_def_cfa_offset 32
; NO_SVE-NEXT: stp d9, d8, [sp, #16] // 16-byte Folded Spill
; NO_SVE-NEXT: .cfi_offset b8, -8
; NO_SVE-NEXT: .cfi_offset b9, -16
; NO_SVE-NEXT: .cfi_offset b10, -24
; NO_SVE-NEXT: .cfi_offset b11, -32
; NO_SVE-NEXT: tst w2, #0x1
; NO_SVE-NEXT: ldr q0, [x0, #240]
; NO_SVE-NEXT: csetm w8, ne
; NO_SVE-NEXT: ldr q1, [x0, #224]
; NO_SVE-NEXT: ldr q2, [x0, #208]
; NO_SVE-NEXT: ldr q3, [x0, #192]
; NO_SVE-NEXT: ldr q4, [x0, #176]
; NO_SVE-NEXT: dup v8.4s, w8
; NO_SVE-NEXT: ldr q5, [x0, #160]
; NO_SVE-NEXT: ldr q6, [x0, #144]
; NO_SVE-NEXT: ldr q7, [x0, #128]
; NO_SVE-NEXT: ldr q16, [x0, #112]
; NO_SVE-NEXT: ldr q17, [x0, #96]
; NO_SVE-NEXT: ldr q18, [x0, #80]
; NO_SVE-NEXT: ldr q19, [x0, #64]
; NO_SVE-NEXT: ldr q20, [x0, #48]
; NO_SVE-NEXT: ldr q21, [x0, #32]
; NO_SVE-NEXT: ldr q22, [x0, #16]
; NO_SVE-NEXT: ldr q23, [x0]
; NO_SVE-NEXT: ldr q24, [x1, #240]
; NO_SVE-NEXT: ldr q25, [x1, #224]
; NO_SVE-NEXT: ldr q26, [x1, #208]
; NO_SVE-NEXT: ldr q27, [x1, #192]
; NO_SVE-NEXT: bif v0.16b, v24.16b, v8.16b
; NO_SVE-NEXT: ldr q28, [x1, #176]
; NO_SVE-NEXT: bif v1.16b, v25.16b, v8.16b
; NO_SVE-NEXT: ldr q29, [x1, #160]
; NO_SVE-NEXT: bif v2.16b, v26.16b, v8.16b
; NO_SVE-NEXT: ldr q30, [x1, #144]
; NO_SVE-NEXT: bif v3.16b, v27.16b, v8.16b
; NO_SVE-NEXT: ldr q31, [x1, #128]
; NO_SVE-NEXT: ldr q9, [x1, #112]
; NO_SVE-NEXT: ldr q10, [x1, #96]
; NO_SVE-NEXT: bif v4.16b, v28.16b, v8.16b
; NO_SVE-NEXT: ldr q28, [x1, #80]
; NO_SVE-NEXT: ldr q24, [x1, #64]
; NO_SVE-NEXT: ldr q25, [x1, #48]
; NO_SVE-NEXT: ldr q26, [x1, #32]
; NO_SVE-NEXT: ldr q27, [x1, #16]
; NO_SVE-NEXT: ldr q11, [x1]
; NO_SVE-NEXT: stp q3, q2, [x0, #192]
; NO_SVE-NEXT: stp q1, q0, [x0, #224]
; NO_SVE-NEXT: mov v0.16b, v8.16b
; NO_SVE-NEXT: mov v1.16b, v8.16b
; NO_SVE-NEXT: mov v2.16b, v8.16b
; NO_SVE-NEXT: bsl v0.16b, v5.16b, v29.16b
; NO_SVE-NEXT: bsl v1.16b, v6.16b, v30.16b
; NO_SVE-NEXT: bsl v2.16b, v7.16b, v31.16b
; NO_SVE-NEXT: mov v3.16b, v8.16b
; NO_SVE-NEXT: stp q0, q4, [x0, #160]
; NO_SVE-NEXT: mov v4.16b, v8.16b
; NO_SVE-NEXT: mov v0.16b, v8.16b
; NO_SVE-NEXT: stp q2, q1, [x0, #128]
; NO_SVE-NEXT: mov v1.16b, v8.16b
; NO_SVE-NEXT: bsl v3.16b, v16.16b, v9.16b
; NO_SVE-NEXT: bsl v4.16b, v17.16b, v10.16b
; NO_SVE-NEXT: bsl v0.16b, v18.16b, v28.16b
; NO_SVE-NEXT: bsl v1.16b, v19.16b, v24.16b
; NO_SVE-NEXT: mov v2.16b, v8.16b
; NO_SVE-NEXT: stp q4, q3, [x0, #96]
; NO_SVE-NEXT: mov v3.16b, v8.16b
; NO_SVE-NEXT: mov v4.16b, v8.16b
; NO_SVE-NEXT: stp q1, q0, [x0, #64]
; NO_SVE-NEXT: mov v0.16b, v8.16b
; NO_SVE-NEXT: bsl v2.16b, v20.16b, v25.16b
; NO_SVE-NEXT: bsl v3.16b, v21.16b, v26.16b
; NO_SVE-NEXT: bsl v4.16b, v22.16b, v27.16b
; NO_SVE-NEXT: bsl v0.16b, v23.16b, v11.16b
; NO_SVE-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload
; NO_SVE-NEXT: stp q3, q2, [x0, #32]
; NO_SVE-NEXT: stp q0, q4, [x0]
; NO_SVE-NEXT: ldp d11, d10, [sp], #32 // 16-byte Folded Reload
; NO_SVE-NEXT: ret
;
; VBITS_GE_2048-LABEL: select_v64f32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: and w8, w2, #0x1
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_2048-NEXT: ptrue p1.s
; VBITS_GE_2048-NEXT: mov z2.s, w8
; VBITS_GE_2048-NEXT: and z2.s, z2.s, #0x1
; VBITS_GE_2048-NEXT: cmpne p1.s, p1/z, z2.s, #0
; VBITS_GE_2048-NEXT: sel z0.s, p1, z0.s, z1.s
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
define void @select_v64f32(<64 x float>* %a, <64 x float>* %b, i1 %mask) vscale_range(16,0) #0 {
; CHECK-LABEL: select_v64f32:
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w2, #0x1
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: ptrue p1.s
; CHECK-NEXT: mov z2.s, w8
; CHECK-NEXT: and z2.s, z2.s, #0x1
; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0
; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load volatile <64 x float>, <64 x float>* %a
%op2 = load volatile <64 x float>, <64 x float>* %b
%sel = select i1 %mask, <64 x float> %op1, <64 x float> %op2
@@ -586,15 +264,7 @@ define void @select_v64f32(<64 x float>* %a, <64 x float>* %b, i1 %mask) #0 {
}
; Don't use SVE for 64-bit vectors.
define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, i1 %mask) #0 {
; NO_SVE-LABEL: select_v1f64:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: tst w0, #0x1
; NO_SVE-NEXT: csetm x8, ne
; NO_SVE-NEXT: fmov d2, x8
; NO_SVE-NEXT: bif v0.8b, v1.8b, v2.8b
; NO_SVE-NEXT: ret
;
define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
@@ -607,15 +277,7 @@ define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, i1 %mask
}
; Don't use SVE for 128-bit vectors.
define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask) #0 {
; NO_SVE-LABEL: select_v2f64:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: tst w0, #0x1
; NO_SVE-NEXT: csetm x8, ne
; NO_SVE-NEXT: dup v2.2d, x8
; NO_SVE-NEXT: bif v0.16b, v1.16b, v2.16b
; NO_SVE-NEXT: ret
;
define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
@@ -627,21 +289,7 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask
ret <2 x double> %sel
}
define void @select_v4f64(<4 x double>* %a, <4 x double>* %b, i1 %mask) #0 {
; NO_SVE-LABEL: select_v4f64:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: tst w2, #0x1
; NO_SVE-NEXT: ldr q0, [x0]
; NO_SVE-NEXT: csetm x8, ne
; NO_SVE-NEXT: ldr q1, [x0, #16]
; NO_SVE-NEXT: ldr q2, [x1]
; NO_SVE-NEXT: ldr q3, [x1, #16]
; NO_SVE-NEXT: dup v4.2d, x8
; NO_SVE-NEXT: bif v0.16b, v2.16b, v4.16b
; NO_SVE-NEXT: bif v1.16b, v3.16b, v4.16b
; NO_SVE-NEXT: stp q0, q1, [x0]
; NO_SVE-NEXT: ret
;
define void @select_v4f64(<4 x double>* %a, <4 x double>* %b, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w2, #0x1
@@ -663,26 +311,24 @@ define void @select_v4f64(<4 x double>* %a, <4 x double>* %b, i1 %mask) #0 {
}
define void @select_v8f64(<8 x double>* %a, <8 x double>* %b, i1 %mask) #0 {
; NO_SVE-LABEL: select_v8f64:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: tst w2, #0x1
; NO_SVE-NEXT: ldr q0, [x0, #48]
; NO_SVE-NEXT: csetm x8, ne
; NO_SVE-NEXT: ldr q1, [x0]
; NO_SVE-NEXT: ldr q2, [x0, #16]
; NO_SVE-NEXT: ldr q3, [x0, #32]
; NO_SVE-NEXT: ldr q4, [x1, #48]
; NO_SVE-NEXT: dup v6.2d, x8
; NO_SVE-NEXT: ldr q5, [x1]
; NO_SVE-NEXT: ldr q7, [x1, #16]
; NO_SVE-NEXT: ldr q16, [x1, #32]
; NO_SVE-NEXT: bif v1.16b, v5.16b, v6.16b
; NO_SVE-NEXT: bif v2.16b, v7.16b, v6.16b
; NO_SVE-NEXT: bif v0.16b, v4.16b, v6.16b
; NO_SVE-NEXT: bif v3.16b, v16.16b, v6.16b
; NO_SVE-NEXT: stp q1, q2, [x0]
; NO_SVE-NEXT: stp q3, q0, [x0, #32]
; NO_SVE-NEXT: ret
; VBITS_GE_256-LABEL: select_v8f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: and w9, w2, #0x1
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ptrue p1.d
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: mov z4.d, x9
; VBITS_GE_256-NEXT: and z4.d, z4.d, #0x1
; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z4.d, #0
; VBITS_GE_256-NEXT: sel z1.d, p1, z1.d, z3.d
; VBITS_GE_256-NEXT: sel z0.d, p1, z0.d, z2.d
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: select_v8f64:
; VBITS_GE_512: // %bb.0:
@@ -704,58 +350,20 @@ define void @select_v8f64(<8 x double>* %a, <8 x double>* %b, i1 %mask) #0 {
ret void
}
define void @select_v16f64(<16 x double>* %a, <16 x double>* %b, i1 %mask) #0 {
; NO_SVE-LABEL: select_v16f64:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: tst w2, #0x1
; NO_SVE-NEXT: ldr q0, [x0, #16]
; NO_SVE-NEXT: csetm x8, ne
; NO_SVE-NEXT: ldr q1, [x0]
; NO_SVE-NEXT: ldr q2, [x0, #48]
; NO_SVE-NEXT: ldr q3, [x0, #32]
; NO_SVE-NEXT: ldr q4, [x0, #80]
; NO_SVE-NEXT: dup v21.2d, x8
; NO_SVE-NEXT: ldr q5, [x0, #64]
; NO_SVE-NEXT: ldr q6, [x0, #112]
; NO_SVE-NEXT: ldr q7, [x0, #96]
; NO_SVE-NEXT: ldr q16, [x1, #16]
; NO_SVE-NEXT: ldr q17, [x1]
; NO_SVE-NEXT: ldr q18, [x1, #48]
; NO_SVE-NEXT: ldr q19, [x1, #32]
; NO_SVE-NEXT: bif v0.16b, v16.16b, v21.16b
; NO_SVE-NEXT: ldr q20, [x1, #80]
; NO_SVE-NEXT: bif v1.16b, v17.16b, v21.16b
; NO_SVE-NEXT: ldr q16, [x1, #64]
; NO_SVE-NEXT: bif v2.16b, v18.16b, v21.16b
; NO_SVE-NEXT: ldr q17, [x1, #112]
; NO_SVE-NEXT: bif v3.16b, v19.16b, v21.16b
; NO_SVE-NEXT: ldr q18, [x1, #96]
; NO_SVE-NEXT: bif v4.16b, v20.16b, v21.16b
; NO_SVE-NEXT: stp q1, q0, [x0]
; NO_SVE-NEXT: mov v0.16b, v21.16b
; NO_SVE-NEXT: mov v1.16b, v21.16b
; NO_SVE-NEXT: stp q3, q2, [x0, #32]
; NO_SVE-NEXT: mov v2.16b, v21.16b
; NO_SVE-NEXT: bsl v0.16b, v5.16b, v16.16b
; NO_SVE-NEXT: bsl v1.16b, v6.16b, v17.16b
; NO_SVE-NEXT: bsl v2.16b, v7.16b, v18.16b
; NO_SVE-NEXT: stp q0, q4, [x0, #64]
; NO_SVE-NEXT: stp q2, q1, [x0, #96]
; NO_SVE-NEXT: ret
;
; VBITS_GE_1024-LABEL: select_v16f64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: and w8, w2, #0x1
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_1024-NEXT: ptrue p1.d
; VBITS_GE_1024-NEXT: mov z2.d, x8
; VBITS_GE_1024-NEXT: and z2.d, z2.d, #0x1
; VBITS_GE_1024-NEXT: cmpne p1.d, p1/z, z2.d, #0
; VBITS_GE_1024-NEXT: sel z0.d, p1, z0.d, z1.d
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
define void @select_v16f64(<16 x double>* %a, <16 x double>* %b, i1 %mask) vscale_range(8,0) #0 {
; CHECK-LABEL: select_v16f64:
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w2, #0x1
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: mov z2.d, x8
; CHECK-NEXT: and z2.d, z2.d, #0x1
; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load volatile <16 x double>, <16 x double>* %a
%op2 = load volatile <16 x double>, <16 x double>* %b
%sel = select i1 %mask, <16 x double> %op1, <16 x double> %op2
@@ -763,103 +371,20 @@ define void @select_v16f64(<16 x double>* %a, <16 x double>* %b, i1 %mask) #0 {
ret void
}
define void @select_v32f64(<32 x double>* %a, <32 x double>* %b, i1 %mask) #0 {
; NO_SVE-LABEL: select_v32f64:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: stp d11, d10, [sp, #-32]! // 16-byte Folded Spill
; NO_SVE-NEXT: .cfi_def_cfa_offset 32
; NO_SVE-NEXT: stp d9, d8, [sp, #16] // 16-byte Folded Spill
; NO_SVE-NEXT: .cfi_offset b8, -8
; NO_SVE-NEXT: .cfi_offset b9, -16
; NO_SVE-NEXT: .cfi_offset b10, -24
; NO_SVE-NEXT: .cfi_offset b11, -32
; NO_SVE-NEXT: tst w2, #0x1
; NO_SVE-NEXT: ldr q0, [x0, #240]
; NO_SVE-NEXT: csetm x8, ne
; NO_SVE-NEXT: ldr q1, [x0, #224]
; NO_SVE-NEXT: ldr q2, [x0, #208]
; NO_SVE-NEXT: ldr q3, [x0, #192]
; NO_SVE-NEXT: ldr q4, [x0, #176]
; NO_SVE-NEXT: dup v8.2d, x8
; NO_SVE-NEXT: ldr q5, [x0, #160]
; NO_SVE-NEXT: ldr q6, [x0, #144]
; NO_SVE-NEXT: ldr q7, [x0, #128]
; NO_SVE-NEXT: ldr q16, [x0, #112]
; NO_SVE-NEXT: ldr q17, [x0, #96]
; NO_SVE-NEXT: ldr q18, [x0, #80]
; NO_SVE-NEXT: ldr q19, [x0, #64]
; NO_SVE-NEXT: ldr q20, [x0, #48]
; NO_SVE-NEXT: ldr q21, [x0, #32]
; NO_SVE-NEXT: ldr q22, [x0, #16]
; NO_SVE-NEXT: ldr q23, [x0]
; NO_SVE-NEXT: ldr q24, [x1, #240]
; NO_SVE-NEXT: ldr q25, [x1, #224]
; NO_SVE-NEXT: ldr q26, [x1, #208]
; NO_SVE-NEXT: ldr q27, [x1, #192]
; NO_SVE-NEXT: bif v0.16b, v24.16b, v8.16b
; NO_SVE-NEXT: ldr q28, [x1, #176]
; NO_SVE-NEXT: bif v1.16b, v25.16b, v8.16b
; NO_SVE-NEXT: ldr q29, [x1, #160]
; NO_SVE-NEXT: bif v2.16b, v26.16b, v8.16b
; NO_SVE-NEXT: ldr q30, [x1, #144]
; NO_SVE-NEXT: bif v3.16b, v27.16b, v8.16b
; NO_SVE-NEXT: ldr q31, [x1, #128]
; NO_SVE-NEXT: ldr q9, [x1, #112]
; NO_SVE-NEXT: ldr q10, [x1, #96]
; NO_SVE-NEXT: bif v4.16b, v28.16b, v8.16b
; NO_SVE-NEXT: ldr q28, [x1, #80]
; NO_SVE-NEXT: ldr q24, [x1, #64]
; NO_SVE-NEXT: ldr q25, [x1, #48]
; NO_SVE-NEXT: ldr q26, [x1, #32]
; NO_SVE-NEXT: ldr q27, [x1, #16]
; NO_SVE-NEXT: ldr q11, [x1]
; NO_SVE-NEXT: stp q3, q2, [x0, #192]
; NO_SVE-NEXT: stp q1, q0, [x0, #224]
; NO_SVE-NEXT: mov v0.16b, v8.16b
; NO_SVE-NEXT: mov v1.16b, v8.16b
; NO_SVE-NEXT: mov v2.16b, v8.16b
; NO_SVE-NEXT: bsl v0.16b, v5.16b, v29.16b
; NO_SVE-NEXT: bsl v1.16b, v6.16b, v30.16b
; NO_SVE-NEXT: bsl v2.16b, v7.16b, v31.16b
; NO_SVE-NEXT: mov v3.16b, v8.16b
; NO_SVE-NEXT: stp q0, q4, [x0, #160]
; NO_SVE-NEXT: mov v4.16b, v8.16b
; NO_SVE-NEXT: mov v0.16b, v8.16b
; NO_SVE-NEXT: stp q2, q1, [x0, #128]
; NO_SVE-NEXT: mov v1.16b, v8.16b
; NO_SVE-NEXT: bsl v3.16b, v16.16b, v9.16b
; NO_SVE-NEXT: bsl v4.16b, v17.16b, v10.16b
; NO_SVE-NEXT: bsl v0.16b, v18.16b, v28.16b
; NO_SVE-NEXT: bsl v1.16b, v19.16b, v24.16b
; NO_SVE-NEXT: mov v2.16b, v8.16b
; NO_SVE-NEXT: stp q4, q3, [x0, #96]
; NO_SVE-NEXT: mov v3.16b, v8.16b
; NO_SVE-NEXT: mov v4.16b, v8.16b
; NO_SVE-NEXT: stp q1, q0, [x0, #64]
; NO_SVE-NEXT: mov v0.16b, v8.16b
; NO_SVE-NEXT: bsl v2.16b, v20.16b, v25.16b
; NO_SVE-NEXT: bsl v3.16b, v21.16b, v26.16b
; NO_SVE-NEXT: bsl v4.16b, v22.16b, v27.16b
; NO_SVE-NEXT: bsl v0.16b, v23.16b, v11.16b
; NO_SVE-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload
; NO_SVE-NEXT: stp q3, q2, [x0, #32]
; NO_SVE-NEXT: stp q0, q4, [x0]
; NO_SVE-NEXT: ldp d11, d10, [sp], #32 // 16-byte Folded Reload
; NO_SVE-NEXT: ret
;
; VBITS_GE_2048-LABEL: select_v32f64:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: and w8, w2, #0x1
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_2048-NEXT: ptrue p1.d
; VBITS_GE_2048-NEXT: mov z2.d, x8
; VBITS_GE_2048-NEXT: and z2.d, z2.d, #0x1
; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z2.d, #0
; VBITS_GE_2048-NEXT: sel z0.d, p1, z0.d, z1.d
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
define void @select_v32f64(<32 x double>* %a, <32 x double>* %b, i1 %mask) vscale_range(16,0) #0 {
; CHECK-LABEL: select_v32f64:
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w2, #0x1
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: mov z2.d, x8
; CHECK-NEXT: and z2.d, z2.d, #0x1
; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load volatile <32 x double>, <32 x double>* %a
%op2 = load volatile <32 x double>, <32 x double>* %b
%sel = select i1 %mask, <32 x double> %op1, <32 x double> %op2
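The vscale_range bounds used throughout encode the same constraint the deleted RUN matrix did: SVE registers come in 128-bit granules, so -aarch64-sve-vector-bits-min=256 implies vscale >= 256/128 = 2, min=1024 implies vscale >= 8, min=2048 implies vscale >= 16, and the 0 upper bound leaves the maximum unconstrained. A minimal standalone sketch of the attribute on a hypothetical function that is not part of this commit:

define void @example(<8 x double>* %p) vscale_range(2,0) "target-features"="+sve" {
  ret void
}

Because the attribute, rather than the command line, now pins the minimum vector length, such functions lower identically under every remaining RUN line and their checks collapse under the shared CHECK prefix.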

File diff suppressed because it is too large


@@ -1,26 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; Don't use SVE when its registers are no bigger than NEON.
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
; Don't use SVE for 64-bit vectors.
define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x i1> %mask) #0 {
define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: shl v2.4h, v2.4h, #15
@@ -32,7 +18,7 @@ define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x i1> %mask
}
; Don't use SVE for 128-bit vectors.
define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x i1> %mask) #0 {
define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ushll v2.8h, v2.8b, #0
@@ -44,7 +30,7 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x i1> %mask
ret <8 x half> %sel
}
define void @select_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
define void @select_v16f16(<16 x half>* %a, <16 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -96,44 +82,16 @@ define void @select_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
ret void
}
define void @select_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
; VBITS_GE_256-LABEL: select_v64f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: mov x9, #48
; VBITS_GE_256-NEXT: mov x10, #32
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x1, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x1, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z2.h, z5.h
; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z1.h, z4.h
; VBITS_GE_256-NEXT: fcmeq p3.h, p0/z, z0.h, z6.h
; VBITS_GE_256-NEXT: fcmeq p4.h, p0/z, z3.h, z7.h
; VBITS_GE_256-NEXT: sel z0.h, p3, z0.h, z6.h
; VBITS_GE_256-NEXT: sel z1.h, p2, z1.h, z4.h
; VBITS_GE_256-NEXT: sel z2.h, p1, z2.h, z5.h
; VBITS_GE_256-NEXT: sel z3.h, p4, z3.h, z7.h
; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: select_v64f16:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_1024-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
; VBITS_GE_1024-NEXT: sel z0.h, p1, z0.h, z1.h
; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
define void @select_v64f16(<64 x half>* %a, <64 x half>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: select_v64f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <64 x half>, <64 x half>* %a
%op2 = load <64 x half>, <64 x half>* %b
%mask = fcmp oeq <64 x half> %op1, %op2
@@ -142,68 +100,16 @@ define void @select_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
ret void
}
define void @select_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
; VBITS_GE_256-LABEL: select_v128f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: mov x9, #48
; VBITS_GE_256-NEXT: mov x10, #32
; VBITS_GE_256-NEXT: mov x11, #80
; VBITS_GE_256-NEXT: mov x12, #64
; VBITS_GE_256-NEXT: mov x13, #112
; VBITS_GE_256-NEXT: mov x14, #96
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x0, x12, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x0, x13, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x0, x14, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z16.h }, p0/z, [x1, x13, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z17.h }, p0/z, [x1, x14, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z18.h }, p0/z, [x1, x11, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z19.h }, p0/z, [x1, x12, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z20.h }, p0/z, [x1, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z21.h }, p0/z, [x1, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z22.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z23.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z6.h, z17.h
; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z5.h, z16.h
; VBITS_GE_256-NEXT: fcmeq p3.h, p0/z, z4.h, z19.h
; VBITS_GE_256-NEXT: fcmeq p4.h, p0/z, z3.h, z18.h
; VBITS_GE_256-NEXT: fcmeq p5.h, p0/z, z2.h, z21.h
; VBITS_GE_256-NEXT: fcmeq p6.h, p0/z, z1.h, z20.h
; VBITS_GE_256-NEXT: fcmeq p7.h, p0/z, z0.h, z22.h
; VBITS_GE_256-NEXT: fcmeq p8.h, p0/z, z7.h, z23.h
; VBITS_GE_256-NEXT: sel z0.h, p7, z0.h, z22.h
; VBITS_GE_256-NEXT: sel z1.h, p6, z1.h, z20.h
; VBITS_GE_256-NEXT: sel z2.h, p5, z2.h, z21.h
; VBITS_GE_256-NEXT: sel z3.h, p4, z3.h, z18.h
; VBITS_GE_256-NEXT: sel z4.h, p3, z4.h, z19.h
; VBITS_GE_256-NEXT: sel z5.h, p2, z5.h, z16.h
; VBITS_GE_256-NEXT: sel z6.h, p1, z6.h, z17.h
; VBITS_GE_256-NEXT: sel z7.h, p8, z7.h, z23.h
; VBITS_GE_256-NEXT: st1h { z6.h }, p0, [x0, x14, lsl #1]
; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x0, x13, lsl #1]
; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0, x12, lsl #1]
; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x11, lsl #1]
; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: select_v128f16:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_2048-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
; VBITS_GE_2048-NEXT: sel z0.h, p1, z0.h, z1.h
; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
define void @select_v128f16(<128 x half>* %a, <128 x half>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: select_v128f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <128 x half>, <128 x half>* %a
%op2 = load <128 x half>, <128 x half>* %b
%mask = fcmp oeq <128 x half> %op1, %op2
@@ -213,7 +119,7 @@ define void @select_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x i1> %mask) #0 {
define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: shl v2.2s, v2.2s, #31
@@ -225,7 +131,7 @@ define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x i1> %m
}
; Don't use SVE for 128-bit vectors.
define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x i1> %mask) #0 {
define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ushll v2.4s, v2.4h, #0
@@ -237,7 +143,7 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x i1> %m
ret <4 x float> %sel
}
define void @select_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
define void @select_v8f32(<8 x float>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -289,44 +195,16 @@ define void @select_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
ret void
}
define void @select_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
; VBITS_GE_256-LABEL: select_v32f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: mov x9, #24
; VBITS_GE_256-NEXT: mov x10, #16
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x1, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z2.s, z5.s
; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z1.s, z4.s
; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z0.s, z6.s
; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z3.s, z7.s
; VBITS_GE_256-NEXT: sel z0.s, p3, z0.s, z6.s
; VBITS_GE_256-NEXT: sel z1.s, p2, z1.s, z4.s
; VBITS_GE_256-NEXT: sel z2.s, p1, z2.s, z5.s
; VBITS_GE_256-NEXT: sel z3.s, p4, z3.s, z7.s
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: select_v32f32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_1024-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_1024-NEXT: sel z0.s, p1, z0.s, z1.s
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
define void @select_v32f32(<32 x float>* %a, <32 x float>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: select_v32f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
%op2 = load <32 x float>, <32 x float>* %b
%mask = fcmp oeq <32 x float> %op1, %op2
@@ -335,68 +213,16 @@ define void @select_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
ret void
}
define void @select_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
; VBITS_GE_256-LABEL: select_v64f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: mov x9, #24
; VBITS_GE_256-NEXT: mov x10, #16
; VBITS_GE_256-NEXT: mov x11, #40
; VBITS_GE_256-NEXT: mov x12, #32
; VBITS_GE_256-NEXT: mov x13, #56
; VBITS_GE_256-NEXT: mov x14, #48
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x12, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x13, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x14, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1w { z16.s }, p0/z, [x1, x13, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z17.s }, p0/z, [x1, x14, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z18.s }, p0/z, [x1, x11, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z19.s }, p0/z, [x1, x12, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z20.s }, p0/z, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z21.s }, p0/z, [x1, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z22.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z23.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z6.s, z17.s
; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z5.s, z16.s
; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z4.s, z19.s
; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z3.s, z18.s
; VBITS_GE_256-NEXT: fcmeq p5.s, p0/z, z2.s, z21.s
; VBITS_GE_256-NEXT: fcmeq p6.s, p0/z, z1.s, z20.s
; VBITS_GE_256-NEXT: fcmeq p7.s, p0/z, z0.s, z22.s
; VBITS_GE_256-NEXT: fcmeq p8.s, p0/z, z7.s, z23.s
; VBITS_GE_256-NEXT: sel z0.s, p7, z0.s, z22.s
; VBITS_GE_256-NEXT: sel z1.s, p6, z1.s, z20.s
; VBITS_GE_256-NEXT: sel z2.s, p5, z2.s, z21.s
; VBITS_GE_256-NEXT: sel z3.s, p4, z3.s, z18.s
; VBITS_GE_256-NEXT: sel z4.s, p3, z4.s, z19.s
; VBITS_GE_256-NEXT: sel z5.s, p2, z5.s, z16.s
; VBITS_GE_256-NEXT: sel z6.s, p1, z6.s, z17.s
; VBITS_GE_256-NEXT: sel z7.s, p8, z7.s, z23.s
; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x0, x14, lsl #2]
; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x0, x13, lsl #2]
; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x12, lsl #2]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0, x11, lsl #2]
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: select_v64f32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_2048-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_2048-NEXT: sel z0.s, p1, z0.s, z1.s
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
define void @select_v64f32(<64 x float>* %a, <64 x float>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: select_v64f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a
%op2 = load <64 x float>, <64 x float>* %b
%mask = fcmp oeq <64 x float> %op1, %op2
@@ -406,7 +232,7 @@ define void @select_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x i1> %mask) #0 {
define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
@@ -419,7 +245,7 @@ define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x i1>
}
; Don't use SVE for 128-bit vectors.
define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x i1> %mask) #0 {
define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ushll v2.2d, v2.2s, #0
@@ -431,7 +257,7 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x i1>
ret <2 x double> %sel
}
define void @select_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
define void @select_v4f64(<4 x double>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -483,44 +309,16 @@ define void @select_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
ret void
}
define void @select_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
; VBITS_GE_256-LABEL: select_v16f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: mov x9, #12
; VBITS_GE_256-NEXT: mov x10, #8
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x1, x10, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z2.d, z5.d
; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z1.d, z4.d
; VBITS_GE_256-NEXT: fcmeq p3.d, p0/z, z0.d, z6.d
; VBITS_GE_256-NEXT: fcmeq p4.d, p0/z, z3.d, z7.d
; VBITS_GE_256-NEXT: sel z0.d, p3, z0.d, z6.d
; VBITS_GE_256-NEXT: sel z1.d, p2, z1.d, z4.d
; VBITS_GE_256-NEXT: sel z2.d, p1, z2.d, z5.d
; VBITS_GE_256-NEXT: sel z3.d, p4, z3.d, z7.d
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: select_v16f64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_1024-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_1024-NEXT: sel z0.d, p1, z0.d, z1.d
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
define void @select_v16f64(<16 x double>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: select_v16f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%op2 = load <16 x double>, <16 x double>* %b
%mask = fcmp oeq <16 x double> %op1, %op2
@@ -529,68 +327,16 @@ define void @select_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
ret void
}
define void @select_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
; VBITS_GE_256-LABEL: select_v32f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: mov x9, #12
; VBITS_GE_256-NEXT: mov x10, #8
; VBITS_GE_256-NEXT: mov x11, #20
; VBITS_GE_256-NEXT: mov x12, #16
; VBITS_GE_256-NEXT: mov x13, #28
; VBITS_GE_256-NEXT: mov x14, #24
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1d { z16.d }, p0/z, [x1, x13, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z17.d }, p0/z, [x1, x14, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z18.d }, p0/z, [x1, x11, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z19.d }, p0/z, [x1, x12, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z20.d }, p0/z, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z21.d }, p0/z, [x1, x10, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z22.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z23.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z6.d, z17.d
; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z5.d, z16.d
; VBITS_GE_256-NEXT: fcmeq p3.d, p0/z, z4.d, z19.d
; VBITS_GE_256-NEXT: fcmeq p4.d, p0/z, z3.d, z18.d
; VBITS_GE_256-NEXT: fcmeq p5.d, p0/z, z2.d, z21.d
; VBITS_GE_256-NEXT: fcmeq p6.d, p0/z, z1.d, z20.d
; VBITS_GE_256-NEXT: fcmeq p7.d, p0/z, z0.d, z22.d
; VBITS_GE_256-NEXT: fcmeq p8.d, p0/z, z7.d, z23.d
; VBITS_GE_256-NEXT: sel z0.d, p7, z0.d, z22.d
; VBITS_GE_256-NEXT: sel z1.d, p6, z1.d, z20.d
; VBITS_GE_256-NEXT: sel z2.d, p5, z2.d, z21.d
; VBITS_GE_256-NEXT: sel z3.d, p4, z3.d, z18.d
; VBITS_GE_256-NEXT: sel z4.d, p3, z4.d, z19.d
; VBITS_GE_256-NEXT: sel z5.d, p2, z5.d, z16.d
; VBITS_GE_256-NEXT: sel z6.d, p1, z6.d, z17.d
; VBITS_GE_256-NEXT: sel z7.d, p8, z7.d, z23.d
; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x0, x14, lsl #3]
; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x0, x13, lsl #3]
; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3]
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x11, lsl #3]
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: select_v32f64:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_2048-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_2048-NEXT: sel z0.d, p1, z0.d, z1.d
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
define void @select_v32f64(<32 x double>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: select_v32f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%op2 = load <32 x double>, <32 x double>* %b
%mask = fcmp oeq <32 x double> %op1, %op2
@@ -599,4 +345,4 @@ define void @select_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
ret void
}
attributes #0 = { "target-features"="+sve" uwtable }
attributes #0 = { "target-features"="+sve" }


@@ -1,21 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; Don't use SVE when its registers are no bigger than NEON.
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
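; Three RUN lines now suffice: functions tagged with vscale_range pin their
; own minimum width, and the min=2048 run reuses the VBITS_GE_512 prefixes,
; presumably because once a fixed-length vector fits in one register the
; generated code stops changing with the register width. A hedged sketch of
; how the CHECK lines are regenerated (run from the llvm source directory;
; the placeholder path is illustrative):
;   utils/update_llc_test_checks.py <path-to-this-test>.ll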
target triple = "aarch64-unknown-linux-gnu"
@@ -24,49 +10,66 @@ target triple = "aarch64-unknown-linux-gnu"
;
; Don't use SVE for 64-bit vectors.
define <4 x half> @insertelement_v4f16(<4 x half> %op1) #0 {
; VBITS_GE_256-LABEL: insertelement_v4f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: fmov h1, #5.00000000
; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $q0
; VBITS_GE_256-NEXT: mov v0.h[3], v1.h[0]
; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_256-NEXT: ret
define <4 x half> @insertelement_v4f16(<4 x half> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: insertelement_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov h1, #5.00000000
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov v0.h[3], v1.h[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%r = insertelement <4 x half> %op1, half 5.0, i64 3
ret <4 x half> %r
}
; Don't use SVE for 128-bit vectors.
define <8 x half> @insertelement_v8f16(<8 x half> %op1) #0 {
; VBITS_GE_256-LABEL: insertelement_v8f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: fmov h1, #5.00000000
; VBITS_GE_256-NEXT: mov v0.h[7], v1.h[0]
; VBITS_GE_256-NEXT: ret
define <8 x half> @insertelement_v8f16(<8 x half> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: insertelement_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov h1, #5.00000000
; CHECK-NEXT: mov v0.h[7], v1.h[0]
; CHECK-NEXT: ret
%r = insertelement <8 x half> %op1, half 5.0, i64 7
ret <8 x half> %r
}
define <16 x half> @insertelement_v16f16(<16 x half>* %a) #0 {
; VBITS_GE_256-LABEL: insertelement_v16f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov w9, #15
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: fmov h2, #5.00000000
; VBITS_GE_256-NEXT: index z3.h, #0, #1
; VBITS_GE_256-NEXT: ptrue p1.h
; VBITS_GE_256-NEXT: mov z1.h, w9
; VBITS_GE_256-NEXT: cmpeq p1.h, p1/z, z3.h, z1.h
; VBITS_GE_256-NEXT: mov z0.h, p1/m, h2
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x8]
; VBITS_GE_256-NEXT: ret
define <16 x half> @insertelement_v16f16(<16 x half>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: insertelement_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w9, #15
; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: fmov h2, #5.00000000
; CHECK-NEXT: index z3.h, #0, #1
; CHECK-NEXT: ptrue p1.h
; CHECK-NEXT: mov z1.h, w9
; CHECK-NEXT: cmpeq p1.h, p1/z, z3.h, z1.h
; CHECK-NEXT: mov z0.h, p1/m, h2
; CHECK-NEXT: st1h { z0.h }, p0, [x8]
; CHECK-NEXT: ret
%op1 = load <16 x half>, <16 x half>* %a
%r = insertelement <16 x half> %op1, half 5.0, i64 15
ret <16 x half> %r
}
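; An informal reading of the sequence above (not autogenerated output):
; "index z3.h, #0, #1" materialises the lane numbers 0,1,2,..., the target
; lane number in w9 is broadcast to z1, "cmpeq" turns the match into a
; single-lane predicate, and the merging "mov z0.h, p1/m, h2" writes the
; scalar into exactly that lane.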
define <32 x half> @insertelement_v32f16(<32 x half>* %a) #0 {
; VBITS_GE_256-LABEL: insertelement_v32f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x9, #16
; VBITS_GE_256-NEXT: mov w10, #15
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: fmov h3, #5.00000000
; VBITS_GE_256-NEXT: index z4.h, #0, #1
; VBITS_GE_256-NEXT: ptrue p1.h
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: mov z2.h, w10
; VBITS_GE_256-NEXT: cmpeq p1.h, p1/z, z4.h, z2.h
; VBITS_GE_256-NEXT: mov z0.h, p1/m, h3
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x8, x9, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: insertelement_v32f16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: mov w9, #31
@@ -85,88 +88,105 @@ define <32 x half> @insertelement_v32f16(<32 x half>* %a) #0 {
ret <32 x half> %r
}
define <64 x half> @insertelement_v64f16(<64 x half>* %a) #0 {
; VBITS_GE_1024-LABEL: insertelement_v64f16:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: mov w9, #63
; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: fmov h2, #5.00000000
; VBITS_GE_1024-NEXT: index z3.h, #0, #1
; VBITS_GE_1024-NEXT: ptrue p1.h
; VBITS_GE_1024-NEXT: mov z1.h, w9
; VBITS_GE_1024-NEXT: cmpeq p1.h, p1/z, z3.h, z1.h
; VBITS_GE_1024-NEXT: mov z0.h, p1/m, h2
; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x8]
; VBITS_GE_1024-NEXT: ret
define <64 x half> @insertelement_v64f16(<64 x half>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: insertelement_v64f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w9, #63
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: fmov h2, #5.00000000
; CHECK-NEXT: index z3.h, #0, #1
; CHECK-NEXT: ptrue p1.h
; CHECK-NEXT: mov z1.h, w9
; CHECK-NEXT: cmpeq p1.h, p1/z, z3.h, z1.h
; CHECK-NEXT: mov z0.h, p1/m, h2
; CHECK-NEXT: st1h { z0.h }, p0, [x8]
; CHECK-NEXT: ret
%op1 = load <64 x half>, <64 x half>* %a
%r = insertelement <64 x half> %op1, half 5.0, i64 63
ret <64 x half> %r
}
define <128 x half> @insertelement_v128f16(<128 x half>* %a) #0 {
; VBITS_GE_2048-LABEL: insertelement_v128f16:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: mov w9, #127
; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_2048-NEXT: fmov h2, #5.00000000
; VBITS_GE_2048-NEXT: index z3.h, #0, #1
; VBITS_GE_2048-NEXT: ptrue p1.h
; VBITS_GE_2048-NEXT: mov z1.h, w9
; VBITS_GE_2048-NEXT: cmpeq p1.h, p1/z, z3.h, z1.h
; VBITS_GE_2048-NEXT: mov z0.h, p1/m, h2
; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x8]
; VBITS_GE_2048-NEXT: ret
define <128 x half> @insertelement_v128f16(<128 x half>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: insertelement_v128f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w9, #127
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: fmov h2, #5.00000000
; CHECK-NEXT: index z3.h, #0, #1
; CHECK-NEXT: ptrue p1.h
; CHECK-NEXT: mov z1.h, w9
; CHECK-NEXT: cmpeq p1.h, p1/z, z3.h, z1.h
; CHECK-NEXT: mov z0.h, p1/m, h2
; CHECK-NEXT: st1h { z0.h }, p0, [x8]
; CHECK-NEXT: ret
%op1 = load <128 x half>, <128 x half>* %a
%r = insertelement <128 x half> %op1, half 5.0, i64 127
ret <128 x half> %r
}
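; An informal ABI note on the stores through x8 above (not autogenerated
; output): these vectors are too wide to be returned in NEON registers, so
; the result is returned indirectly and, per AAPCS64, x8 carries the address
; of the caller-allocated result buffer.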
; Don't use SVE for 64-bit vectors.
define <2 x float> @insertelement_v2f32(<2 x float> %op1) #0 {
; VBITS_GE_256-LABEL: insertelement_v2f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: fmov s1, #5.00000000
; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $q0
; VBITS_GE_256-NEXT: mov v0.s[1], v1.s[0]
; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_256-NEXT: ret
define <2 x float> @insertelement_v2f32(<2 x float> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: insertelement_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov s1, #5.00000000
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov v0.s[1], v1.s[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%r = insertelement <2 x float> %op1, float 5.0, i64 1
ret <2 x float> %r
}
; Don't use SVE for 128-bit vectors.
define <4 x float> @insertelement_v4f32(<4 x float> %op1) #0 {
; VBITS_GE_256-LABEL: insertelement_v4f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: fmov s1, #5.00000000
; VBITS_GE_256-NEXT: mov v0.s[3], v1.s[0]
; VBITS_GE_256-NEXT: ret
define <4 x float> @insertelement_v4f32(<4 x float> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: insertelement_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov s1, #5.00000000
; CHECK-NEXT: mov v0.s[3], v1.s[0]
; CHECK-NEXT: ret
%r = insertelement <4 x float> %op1, float 5.0, i64 3
ret <4 x float> %r
}
define <8 x float> @insertelement_v8f32(<8 x float>* %a) #0 {
; VBITS_GE_256-LABEL: insertelement_v8f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov w9, #7
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: fmov s2, #5.00000000
; VBITS_GE_256-NEXT: index z3.s, #0, #1
; VBITS_GE_256-NEXT: ptrue p1.s
; VBITS_GE_256-NEXT: mov z1.s, w9
; VBITS_GE_256-NEXT: cmpeq p1.s, p1/z, z3.s, z1.s
; VBITS_GE_256-NEXT: mov z0.s, p1/m, s2
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_256-NEXT: ret
define <8 x float> @insertelement_v8f32(<8 x float>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: insertelement_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w9, #7
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: fmov s2, #5.00000000
; CHECK-NEXT: index z3.s, #0, #1
; CHECK-NEXT: ptrue p1.s
; CHECK-NEXT: mov z1.s, w9
; CHECK-NEXT: cmpeq p1.s, p1/z, z3.s, z1.s
; CHECK-NEXT: mov z0.s, p1/m, s2
; CHECK-NEXT: st1w { z0.s }, p0, [x8]
; CHECK-NEXT: ret
%op1 = load <8 x float>, <8 x float>* %a
%r = insertelement <8 x float> %op1, float 5.0, i64 7
ret <8 x float> %r
}
define <16 x float> @insertelement_v16f32(<16 x float>* %a) #0 {
; VBITS_GE_256-LABEL: insertelement_v16f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x9, #8
; VBITS_GE_256-NEXT: mov w10, #7
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: fmov s3, #5.00000000
; VBITS_GE_256-NEXT: index z4.s, #0, #1
; VBITS_GE_256-NEXT: ptrue p1.s
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: mov z2.s, w10
; VBITS_GE_256-NEXT: cmpeq p1.s, p1/z, z4.s, z2.s
; VBITS_GE_256-NEXT: mov z0.s, p1/m, s3
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: insertelement_v16f32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: mov w9, #15
@@ -185,86 +205,103 @@ define <16 x float> @insertelement_v16f32(<16 x float>* %a) #0 {
ret <16 x float> %r
}
define <32 x float> @insertelement_v32f32(<32 x float>* %a) #0 {
; VBITS_GE_1024-LABEL: insertelement_v32f32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: mov w9, #31
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: fmov s2, #5.00000000
; VBITS_GE_1024-NEXT: index z3.s, #0, #1
; VBITS_GE_1024-NEXT: ptrue p1.s
; VBITS_GE_1024-NEXT: mov z1.s, w9
; VBITS_GE_1024-NEXT: cmpeq p1.s, p1/z, z3.s, z1.s
; VBITS_GE_1024-NEXT: mov z0.s, p1/m, s2
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_1024-NEXT: ret
define <32 x float> @insertelement_v32f32(<32 x float>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: insertelement_v32f32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w9, #31
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: fmov s2, #5.00000000
; CHECK-NEXT: index z3.s, #0, #1
; CHECK-NEXT: ptrue p1.s
; CHECK-NEXT: mov z1.s, w9
; CHECK-NEXT: cmpeq p1.s, p1/z, z3.s, z1.s
; CHECK-NEXT: mov z0.s, p1/m, s2
; CHECK-NEXT: st1w { z0.s }, p0, [x8]
; CHECK-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
%r = insertelement <32 x float> %op1, float 5.0, i64 31
ret <32 x float> %r
}
define <64 x float> @insertelement_v64f32(<64 x float>* %a) #0 {
; VBITS_GE_2048-LABEL: insertelement_v64f32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: mov w9, #63
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: fmov s2, #5.00000000
; VBITS_GE_2048-NEXT: index z3.s, #0, #1
; VBITS_GE_2048-NEXT: ptrue p1.s
; VBITS_GE_2048-NEXT: mov z1.s, w9
; VBITS_GE_2048-NEXT: cmpeq p1.s, p1/z, z3.s, z1.s
; VBITS_GE_2048-NEXT: mov z0.s, p1/m, s2
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_2048-NEXT: ret
define <64 x float> @insertelement_v64f32(<64 x float>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: insertelement_v64f32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w9, #63
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: fmov s2, #5.00000000
; CHECK-NEXT: index z3.s, #0, #1
; CHECK-NEXT: ptrue p1.s
; CHECK-NEXT: mov z1.s, w9
; CHECK-NEXT: cmpeq p1.s, p1/z, z3.s, z1.s
; CHECK-NEXT: mov z0.s, p1/m, s2
; CHECK-NEXT: st1w { z0.s }, p0, [x8]
; CHECK-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a
%r = insertelement <64 x float> %op1, float 5.0, i64 63
ret <64 x float> %r
}
; Don't use SVE for 64-bit vectors.
define <1 x double> @insertelement_v1f64(<1 x double> %op1) #0 {
; VBITS_GE_256-LABEL: insertelement_v1f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4617315517961601024
; VBITS_GE_256-NEXT: fmov d0, x8
; VBITS_GE_256-NEXT: ret
define <1 x double> @insertelement_v1f64(<1 x double> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: insertelement_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x8, #4617315517961601024
; CHECK-NEXT: fmov d0, x8
; CHECK-NEXT: ret
%r = insertelement <1 x double> %op1, double 5.0, i64 0
ret <1 x double> %r
}
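; A quick decode of the immediate above (informal, not autogenerated
; output): 4617315517961601024 = 0x4014000000000000, the IEEE-754 bit
; pattern of double 5.0, so the one-element result is materialised with a
; scalar mov plus fmov rather than any vector operation.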
; Don't use SVE for 128-bit vectors.
define <2 x double> @insertelement_v2f64(<2 x double> %op1) #0 {
; VBITS_GE_256-LABEL: insertelement_v2f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: fmov d1, #5.00000000
; VBITS_GE_256-NEXT: mov v0.d[1], v1.d[0]
; VBITS_GE_256-NEXT: ret
define <2 x double> @insertelement_v2f64(<2 x double> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: insertelement_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d1, #5.00000000
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: ret
%r = insertelement <2 x double> %op1, double 5.0, i64 1
ret <2 x double> %r
}
define <4 x double> @insertelement_v4f64(<4 x double>* %a) #0 {
; VBITS_GE_256-LABEL: insertelement_v4f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov w9, #3
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: fmov d2, #5.00000000
; VBITS_GE_256-NEXT: index z3.d, #0, #1
; VBITS_GE_256-NEXT: ptrue p1.d
; VBITS_GE_256-NEXT: mov z1.d, x9
; VBITS_GE_256-NEXT: cmpeq p1.d, p1/z, z3.d, z1.d
; VBITS_GE_256-NEXT: mov z0.d, p1/m, d2
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_256-NEXT: ret
define <4 x double> @insertelement_v4f64(<4 x double>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: insertelement_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w9, #3
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: fmov d2, #5.00000000
; CHECK-NEXT: index z3.d, #0, #1
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: mov z1.d, x9
; CHECK-NEXT: cmpeq p1.d, p1/z, z3.d, z1.d
; CHECK-NEXT: mov z0.d, p1/m, d2
; CHECK-NEXT: st1d { z0.d }, p0, [x8]
; CHECK-NEXT: ret
%op1 = load <4 x double>, <4 x double>* %a
%r = insertelement <4 x double> %op1, double 5.0, i64 3
ret <4 x double> %r
}
define <8 x double> @insertelement_v8f64(<8 x double>* %a) #0 {
; VBITS_GE_256-LABEL: insertelement_v8f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x9, #4
; VBITS_GE_256-NEXT: mov w10, #3
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: fmov d3, #5.00000000
; VBITS_GE_256-NEXT: index z4.d, #0, #1
; VBITS_GE_256-NEXT: ptrue p1.d
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: mov z2.d, x10
; VBITS_GE_256-NEXT: cmpeq p1.d, p1/z, z4.d, z2.d
; VBITS_GE_256-NEXT: mov z0.d, p1/m, d3
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: insertelement_v8f64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: mov w9, #7
@@ -283,39 +320,39 @@ define <8 x double> @insertelement_v8f64(<8 x double>* %a) #0 {
ret <8 x double> %r
}
define <16 x double> @insertelement_v16f64(<16 x double>* %a) #0 {
; VBITS_GE_1024-LABEL: insertelement_v16f64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: mov w9, #15
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: fmov d2, #5.00000000
; VBITS_GE_1024-NEXT: index z3.d, #0, #1
; VBITS_GE_1024-NEXT: ptrue p1.d
; VBITS_GE_1024-NEXT: mov z1.d, x9
; VBITS_GE_1024-NEXT: cmpeq p1.d, p1/z, z3.d, z1.d
; VBITS_GE_1024-NEXT: mov z0.d, p1/m, d2
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_1024-NEXT: ret
define <16 x double> @insertelement_v16f64(<16 x double>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: insertelement_v16f64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w9, #15
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: fmov d2, #5.00000000
; CHECK-NEXT: index z3.d, #0, #1
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: mov z1.d, x9
; CHECK-NEXT: cmpeq p1.d, p1/z, z3.d, z1.d
; CHECK-NEXT: mov z0.d, p1/m, d2
; CHECK-NEXT: st1d { z0.d }, p0, [x8]
; CHECK-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%r = insertelement <16 x double> %op1, double 5.0, i64 15
ret <16 x double> %r
}
define <32 x double> @insertelement_v32f64(<32 x double>* %a) #0 {
; VBITS_GE_2048-LABEL: insertelement_v32f64:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: mov w9, #31
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: fmov d2, #5.00000000
; VBITS_GE_2048-NEXT: index z3.d, #0, #1
; VBITS_GE_2048-NEXT: ptrue p1.d
; VBITS_GE_2048-NEXT: mov z1.d, x9
; VBITS_GE_2048-NEXT: cmpeq p1.d, p1/z, z3.d, z1.d
; VBITS_GE_2048-NEXT: mov z0.d, p1/m, d2
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_2048-NEXT: ret
define <32 x double> @insertelement_v32f64(<32 x double>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: insertelement_v32f64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w9, #31
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: fmov d2, #5.00000000
; CHECK-NEXT: index z3.d, #0, #1
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: mov z1.d, x9
; CHECK-NEXT: cmpeq p1.d, p1/z, z3.d, z1.d
; CHECK-NEXT: mov z0.d, p1/m, d2
; CHECK-NEXT: st1d { z0.d }, p0, [x8]
; CHECK-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%r = insertelement <32 x double> %op1, double 5.0, i64 31
ret <32 x double> %r

File diff suppressed because it is too large


@@ -1,58 +1,46 @@
; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: z{0-9}
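; For reference, a minimal illustration of the attribute used throughout the
; updated tests (a hypothetical example, not part of this diff): in
; vscale_range(min,max), max == 0 means "no upper bound", so vscale_range(2,0)
; only promises vscale >= 2, i.e. SVE registers of at least 2 x 128 = 256 bits.
define void @vscale_range_sketch() vscale_range(2,0) "target-features"="+sve" {
  ret void
}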
;
; ICMP EQ
;
; Don't use SVE for 64-bit vectors.
define <8 x i8> @icmp_eq_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
define <8 x i8> @icmp_eq_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v8i8:
; CHECK: cmeq v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: cmeq v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
%cmp = icmp eq <8 x i8> %op1, %op2
%sext = sext <8 x i1> %cmp to <8 x i8>
ret <8 x i8> %sext
}
; Don't use SVE for 128-bit vectors.
define <16 x i8> @icmp_eq_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
define <16 x i8> @icmp_eq_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v16i8:
; CHECK: cmeq v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: cmeq v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%cmp = icmp eq <16 x i8> %op1, %op2
%sext = sext <16 x i1> %cmp to <16 x i8>
ret <16 x i8> %sext
}
define void @icmp_eq_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
define void @icmp_eq_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v32i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; CHECK-NEXT: cmpeq [[CMP:p[0-9]+]].b, [[PG]]/z, [[OP1]].b, [[OP2]].b
; CHECK-NEXT: mov [[SEXT:z[0-9]+]].b, [[CMP]]/z, #-1
; CHECK-NEXT: st1b { [[SEXT]].b }, [[PG]], [x0]
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl32
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b
; CHECK-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b
%cmp = icmp eq <32 x i8> %op1, %op2
@@ -62,29 +50,31 @@ define void @icmp_eq_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
}
define void @icmp_eq_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
; CHECK-LABEL: icmp_eq_v64i8:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: cmpeq [[CMP:p[0-9]+]].b, [[PG]]/z, [[OP1]].b, [[OP2]].b
; VBITS_GE_512-NEXT: mov [[SEXT:z[0-9]+]].b, [[CMP]]/z, #-1
; VBITS_GE_512-NEXT: st1b { [[SEXT]].b }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret
; Ensure sensible type legalisation
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
; VBITS_EQ_256-DAG: ld1b { [[OP1_LO:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1b { [[OP1_HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
; VBITS_EQ_256-DAG: ld1b { [[OP2_LO:z[0-9]+]].b }, [[PG]]/z, [x1]
; VBITS_EQ_256-DAG: ld1b { [[OP2_HI:z[0-9]+]].b }, [[PG]]/z, [x1, x[[NUMELTS]]]
; VBITS_EQ_256-DAG: cmpeq [[CMP_LO:p[0-9]+]].b, [[PG]]/z, [[OP1_LO]].b, [[OP2_LO]].b
; VBITS_EQ_256-DAG: cmpeq [[CMP_HI:p[0-9]+]].b, [[PG]]/z, [[OP1_HI]].b, [[OP2_HI]].b
; VBITS_EQ_256-DAG: mov [[SEXT_LO:z[0-9]+]].b, [[CMP_LO]]/z, #-1
; VBITS_EQ_256-DAG: mov [[SEXT_HI:z[0-9]+]].b, [[CMP_HI]]/z, #-1
; VBITS_EQ_256-DAG: st1b { [[SEXT_LO]].b }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1b { [[SEXT_HI]].b }, [[PG]], [x0, x[[NUMELTS]]]
; VBITS_EQ_256-NEXT: ret
; VBITS_GE_256-LABEL: icmp_eq_v64i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov w8, #32
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.b, p0/z, z0.b, z2.b
; VBITS_GE_256-NEXT: cmpeq p2.b, p0/z, z1.b, z3.b
; VBITS_GE_256-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z1.b, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: icmp_eq_v64i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.b, vl64
; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b
; VBITS_GE_512-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <64 x i8>, <64 x i8>* %a
%op2 = load <64 x i8>, <64 x i8>* %b
%cmp = icmp eq <64 x i8> %op1, %op2
@@ -93,15 +83,16 @@ define void @icmp_eq_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
ret void
}
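; A sanity check on the VBITS_GE_256 split above (an informal note, not
; autogenerated output): <64 x i8> is 512 bits, so with 256-bit registers
; (hence "ptrue p0.b, vl32") it legalises into two halves, the upper half
; being addressed at element offset 32 via "mov w8, #32".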
define void @icmp_eq_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
define void @icmp_eq_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: icmp_eq_v128i8:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: cmpeq [[CMP:p[0-9]+]].b, [[PG]]/z, [[OP1]].b, [[OP2]].b
; VBITS_GE_1024-NEXT: mov [[SEXT:z[0-9]+]].b, [[CMP]]/z, #-1
; VBITS_GE_1024-NEXT: st1b { [[SEXT]].b }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl128
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b
; CHECK-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <128 x i8>, <128 x i8>* %a
%op2 = load <128 x i8>, <128 x i8>* %b
%cmp = icmp eq <128 x i8> %op1, %op2
@@ -110,15 +101,16 @@ define void @icmp_eq_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
ret void
}
define void @icmp_eq_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
define void @icmp_eq_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: icmp_eq_v256i8:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: cmpeq [[CMP:p[0-9]+]].b, [[PG]]/z, [[OP1]].b, [[OP2]].b
; VBITS_GE_2048-NEXT: mov [[SEXT:z[0-9]+]].b, [[CMP]]/z, #-1
; VBITS_GE_2048-NEXT: st1b { [[SEXT]].b }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl256
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b
; CHECK-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <256 x i8>, <256 x i8>* %a
%op2 = load <256 x i8>, <256 x i8>* %b
%cmp = icmp eq <256 x i8> %op1, %op2
@@ -128,34 +120,37 @@ define void @icmp_eq_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
define <4 x i16> @icmp_eq_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
define <4 x i16> @icmp_eq_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v4i16:
; CHECK: cmeq v0.4h, v0.4h, v1.4h
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: cmeq v0.4h, v0.4h, v1.4h
; CHECK-NEXT: ret
%cmp = icmp eq <4 x i16> %op1, %op2
%sext = sext <4 x i1> %cmp to <4 x i16>
ret <4 x i16> %sext
}
; Don't use SVE for 128-bit vectors.
define <8 x i16> @icmp_eq_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
define <8 x i16> @icmp_eq_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v8i16:
; CHECK: cmeq v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: cmeq v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
%cmp = icmp eq <8 x i16> %op1, %op2
%sext = sext <8 x i1> %cmp to <8 x i16>
ret <8 x i16> %sext
}
define void @icmp_eq_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
define void @icmp_eq_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v16i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK-NEXT: cmpeq [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h
; CHECK-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1
; CHECK-NEXT: st1h { [[SEXT]].h }, [[PG]], [x0]
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h
; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b
%cmp = icmp eq <16 x i16> %op1, %op2
@@ -165,29 +160,31 @@ define void @icmp_eq_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
}
define void @icmp_eq_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
; CHECK-LABEL: icmp_eq_v32i16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: cmpeq [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h
; VBITS_GE_512-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1
; VBITS_GE_512-NEXT: st1h { [[SEXT]].h }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret
; Ensure sensible type legalisation
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: cmpeq [[CMP_LO:p[0-9]+]].h, [[PG]]/z, [[OP1_LO]].h, [[OP2_LO]].h
; VBITS_EQ_256-DAG: cmpeq [[CMP_HI:p[0-9]+]].h, [[PG]]/z, [[OP1_HI]].h, [[OP2_HI]].h
; VBITS_EQ_256-DAG: mov [[SEXT_LO:z[0-9]+]].h, [[CMP_LO]]/z, #-1
; VBITS_EQ_256-DAG: mov [[SEXT_HI:z[0-9]+]].h, [[CMP_HI]]/z, #-1
; VBITS_EQ_256-DAG: st1h { [[SEXT_LO]].h }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1h { [[SEXT_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-NEXT: ret
; VBITS_GE_256-LABEL: icmp_eq_v32i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.h, p0/z, z0.h, z2.h
; VBITS_GE_256-NEXT: cmpeq p2.h, p0/z, z1.h, z3.h
; VBITS_GE_256-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: icmp_eq_v32i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h
; VBITS_GE_512-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <32 x i16>, <32 x i16>* %a
%op2 = load <32 x i16>, <32 x i16>* %b
%cmp = icmp eq <32 x i16> %op1, %op2
@@ -196,15 +193,16 @@ define void @icmp_eq_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
ret void
}
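; An informal note on the addressing above (not autogenerated output): x8
; holds an element offset ("mov x8, #16") and the "lsl #1" in the ld1h/st1h
; operands scales it by sizeof(i16) into a byte offset; the i32 and i64
; splits below use "lsl #2" and "lsl #3" for the same reason.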
define void @icmp_eq_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
define void @icmp_eq_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: icmp_eq_v64i16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: cmpeq [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h
; VBITS_GE_1024-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1
; VBITS_GE_1024-NEXT: st1h { [[SEXT]].h }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h
; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%op2 = load <64 x i16>, <64 x i16>* %b
%cmp = icmp eq <64 x i16> %op1, %op2
@@ -213,15 +211,16 @@ define void @icmp_eq_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
ret void
}
define void @icmp_eq_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
define void @icmp_eq_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: icmp_eq_v128i16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: cmpeq [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h
; VBITS_GE_2048-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1
; VBITS_GE_2048-NEXT: st1h { [[SEXT]].h }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h
; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <128 x i16>, <128 x i16>* %a
%op2 = load <128 x i16>, <128 x i16>* %b
%cmp = icmp eq <128 x i16> %op1, %op2
@@ -231,34 +230,37 @@ define void @icmp_eq_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
define <2 x i32> @icmp_eq_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
define <2 x i32> @icmp_eq_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v2i32:
; CHECK: cmeq v0.2s, v0.2s, v1.2s
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: cmeq v0.2s, v0.2s, v1.2s
; CHECK-NEXT: ret
%cmp = icmp eq <2 x i32> %op1, %op2
%sext = sext <2 x i1> %cmp to <2 x i32>
ret <2 x i32> %sext
}
; Don't use SVE for 128-bit vectors.
define <4 x i32> @icmp_eq_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
define <4 x i32> @icmp_eq_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v4i32:
; CHECK: cmeq v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
%cmp = icmp eq <4 x i32> %op1, %op2
%sext = sext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %sext
}
define void @icmp_eq_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
define void @icmp_eq_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v8i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK-NEXT: cmpeq [[CMP:p[0-9]+]].s, [[PG]]/z, [[OP1]].s, [[OP2]].s
; CHECK-NEXT: mov [[SEXT:z[0-9]+]].s, [[CMP]]/z, #-1
; CHECK-NEXT: st1w { [[SEXT]].s }, [[PG]], [x0]
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s
; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <8 x i32>, <8 x i32>* %a
%op2 = load <8 x i32>, <8 x i32>* %b
%cmp = icmp eq <8 x i32> %op1, %op2
@@ -268,29 +270,31 @@ define void @icmp_eq_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
}
define void @icmp_eq_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
; CHECK-LABEL: icmp_eq_v16i32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: cmpeq [[CMP:p[0-9]+]].s, [[PG]]/z, [[OP1]].s, [[OP2]].s
; VBITS_GE_512-NEXT: mov [[SEXT:z[0-9]+]].s, [[CMP]]/z, #-1
; VBITS_GE_512-NEXT: st1w { [[SEXT]].s }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret
; Ensure sensible type legalisation
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: cmpeq [[CMP_LO:p[0-9]+]].s, [[PG]]/z, [[OP1_LO]].s, [[OP2_LO]].s
; VBITS_EQ_256-DAG: cmpeq [[CMP_HI:p[0-9]+]].s, [[PG]]/z, [[OP1_HI]].s, [[OP2_HI]].s
; VBITS_EQ_256-DAG: mov [[SEXT_LO:z[0-9]+]].s, [[CMP_LO]]/z, #-1
; VBITS_EQ_256-DAG: mov [[SEXT_HI:z[0-9]+]].s, [[CMP_HI]]/z, #-1
; VBITS_EQ_256-DAG: st1w { [[SEXT_LO]].s }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1w { [[SEXT_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-NEXT: ret
; VBITS_GE_256-LABEL: icmp_eq_v16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, z2.s
; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z1.s, z3.s
; VBITS_GE_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z1.s, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: icmp_eq_v16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_512-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b
%cmp = icmp eq <16 x i32> %op1, %op2
@@ -299,15 +303,16 @@ define void @icmp_eq_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
ret void
}
define void @icmp_eq_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
define void @icmp_eq_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: icmp_eq_v32i32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: cmpeq [[CMP:p[0-9]+]].s, [[PG]]/z, [[OP1]].s, [[OP2]].s
; VBITS_GE_1024-NEXT: mov [[SEXT:z[0-9]+]].s, [[CMP]]/z, #-1
; VBITS_GE_1024-NEXT: st1w { [[SEXT]].s }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s
; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%op2 = load <32 x i32>, <32 x i32>* %b
%cmp = icmp eq <32 x i32> %op1, %op2
@@ -316,15 +321,16 @@ define void @icmp_eq_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
ret void
}
define void @icmp_eq_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
define void @icmp_eq_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: icmp_eq_v64i32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: cmpeq [[CMP:p[0-9]+]].s, [[PG]]/z, [[OP1]].s, [[OP2]].s
; VBITS_GE_2048-NEXT: mov [[SEXT:z[0-9]+]].s, [[CMP]]/z, #-1
; VBITS_GE_2048-NEXT: st1w { [[SEXT]].s }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s
; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <64 x i32>, <64 x i32>* %a
%op2 = load <64 x i32>, <64 x i32>* %b
%cmp = icmp eq <64 x i32> %op1, %op2
@@ -334,34 +340,37 @@ define void @icmp_eq_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
define <1 x i64> @icmp_eq_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
define <1 x i64> @icmp_eq_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v1i64:
; CHECK: cmeq d0, d0, d1
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: cmeq d0, d0, d1
; CHECK-NEXT: ret
%cmp = icmp eq <1 x i64> %op1, %op2
%sext = sext <1 x i1> %cmp to <1 x i64>
ret <1 x i64> %sext
}
; Don't use SVE for 128-bit vectors.
define <2 x i64> @icmp_eq_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
define <2 x i64> @icmp_eq_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v2i64:
; CHECK: cmeq v0.2d, v0.2d, v1.2d
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: cmeq v0.2d, v0.2d, v1.2d
; CHECK-NEXT: ret
%cmp = icmp eq <2 x i64> %op1, %op2
%sext = sext <2 x i1> %cmp to <2 x i64>
ret <2 x i64> %sext
}
define void @icmp_eq_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
define void @icmp_eq_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK-NEXT: cmpeq [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP1]].d, [[OP2]].d
; CHECK-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1
; CHECK-NEXT: st1d { [[SEXT]].d }, [[PG]], [x0]
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d
; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <4 x i64>, <4 x i64>* %a
%op2 = load <4 x i64>, <4 x i64>* %b
%cmp = icmp eq <4 x i64> %op1, %op2
@@ -371,29 +380,31 @@ define void @icmp_eq_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
}
define void @icmp_eq_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
; CHECK-LABEL: icmp_eq_v8i64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: cmpeq [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP1]].d, [[OP2]].d
; VBITS_GE_512-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1
; VBITS_GE_512-NEXT: st1d { [[SEXT]].d }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret
; Ensure sensible type legalisation
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: cmpeq [[CMP_LO:p[0-9]+]].d, [[PG]]/z, [[OP1_LO]].d, [[OP2_LO]].d
; VBITS_EQ_256-DAG: cmpeq [[CMP_HI:p[0-9]+]].d, [[PG]]/z, [[OP1_HI]].d, [[OP2_HI]].d
; VBITS_EQ_256-DAG: mov [[SEXT_LO:z[0-9]+]].d, [[CMP_LO]]/z, #-1
; VBITS_EQ_256-DAG: mov [[SEXT_HI:z[0-9]+]].d, [[CMP_HI]]/z, #-1
; VBITS_EQ_256-DAG: st1d { [[SEXT_LO]].d }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1d { [[SEXT_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-NEXT: ret
; VBITS_GE_256-LABEL: icmp_eq_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, z2.d
; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, z3.d
; VBITS_GE_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z1.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: icmp_eq_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_512-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <8 x i64>, <8 x i64>* %a
%op2 = load <8 x i64>, <8 x i64>* %b
%cmp = icmp eq <8 x i64> %op1, %op2
@@ -402,15 +413,16 @@ define void @icmp_eq_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
ret void
}
define void @icmp_eq_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
define void @icmp_eq_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: icmp_eq_v16i64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: cmpeq [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP1]].d, [[OP2]].d
; VBITS_GE_1024-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1
; VBITS_GE_1024-NEXT: st1d { [[SEXT]].d }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d
; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%op2 = load <16 x i64>, <16 x i64>* %b
%cmp = icmp eq <16 x i64> %op1, %op2
@@ -419,15 +431,16 @@ define void @icmp_eq_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
ret void
}
define void @icmp_eq_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
define void @icmp_eq_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: icmp_eq_v32i64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: cmpeq [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP1]].d, [[OP2]].d
; VBITS_GE_2048-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1
; VBITS_GE_2048-NEXT: st1d { [[SEXT]].d }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d
; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%op2 = load <32 x i64>, <32 x i64>* %b
%cmp = icmp eq <32 x i64> %op1, %op2
@@ -440,15 +453,16 @@ define void @icmp_eq_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
; ICMP NE
;
define void @icmp_ne_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
define void @icmp_ne_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_ne_v32i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; CHECK-NEXT: cmpne [[CMP:p[0-9]+]].b, [[PG]]/z, [[OP1]].b, [[OP2]].b
; CHECK-NEXT: mov [[SEXT:z[0-9]+]].b, [[CMP]]/z, #-1
; CHECK-NEXT: st1b { [[SEXT]].b }, [[PG]], [x0]
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl32
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: cmpne p1.b, p0/z, z0.b, z1.b
; CHECK-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b
%cmp = icmp ne <32 x i8> %op1, %op2
@@ -461,15 +475,16 @@ define void @icmp_ne_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
; ICMP SGE
;
define void @icmp_sge_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
define void @icmp_sge_v32i16(<32 x i16>* %a, <32 x i16>* %b) vscale_range(4,0) #0 {
; CHECK-LABEL: icmp_sge_v32i16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: cmpge [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h
; VBITS_GE_512-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1
; VBITS_GE_512-NEXT: st1h { [[SEXT]].h }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl32
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: cmpge p1.h, p0/z, z0.h, z1.h
; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i16>, <32 x i16>* %a
%op2 = load <32 x i16>, <32 x i16>* %b
%cmp = icmp sge <32 x i16> %op1, %op2
@@ -482,15 +497,16 @@ define void @icmp_sge_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
; ICMP SGT
;
define void @icmp_sgt_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
define void @icmp_sgt_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_sgt_v16i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK-NEXT: cmpgt [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h
; CHECK-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1
; CHECK-NEXT: st1h { [[SEXT]].h }, [[PG]], [x0]
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: cmpgt p1.h, p0/z, z0.h, z1.h
; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b
%cmp = icmp sgt <16 x i16> %op1, %op2
@@ -503,15 +519,16 @@ define void @icmp_sgt_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
; ICMP SLE
;
define void @icmp_sle_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
define void @icmp_sle_v16i32(<16 x i32>* %a, <16 x i32>* %b) vscale_range(4,0) #0 {
; CHECK-LABEL: icmp_sle_v16i32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: cmpge [[CMP:p[0-9]+]].s, [[PG]]/z, [[OP2]].s, [[OP1]].s
; VBITS_GE_512-NEXT: mov [[SEXT:z[0-9]+]].s, [[CMP]]/z, #-1
; VBITS_GE_512-NEXT: st1w { [[SEXT]].s }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl16
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: cmpge p1.s, p0/z, z1.s, z0.s
; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b
%cmp = icmp sle <16 x i32> %op1, %op2
@@ -524,15 +541,16 @@ define void @icmp_sle_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
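; An informal note on the operand order above (not autogenerated output):
; "icmp sle a, b" is emitted as "cmpge p1.s, p0/z, b, a" with the operands
; swapped; the slt, ule and ult cases below apply the same swap using
; cmpgt, cmphs and cmphi respectively.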
; ICMP SLT
;
define void @icmp_slt_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
define void @icmp_slt_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_slt_v8i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK-NEXT: cmpgt [[CMP:p[0-9]+]].s, [[PG]]/z, [[OP2]].s, [[OP1]].s
; CHECK-NEXT: mov [[SEXT:z[0-9]+]].s, [[CMP]]/z, #-1
; CHECK-NEXT: st1w { [[SEXT]].s }, [[PG]], [x0]
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: cmpgt p1.s, p0/z, z1.s, z0.s
; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <8 x i32>, <8 x i32>* %a
%op2 = load <8 x i32>, <8 x i32>* %b
%cmp = icmp slt <8 x i32> %op1, %op2
@@ -545,15 +563,16 @@ define void @icmp_slt_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
; ICMP UGE
;
define void @icmp_uge_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
define void @icmp_uge_v8i64(<8 x i64>* %a, <8 x i64>* %b) vscale_range(4,0) #0 {
; CHECK-LABEL: icmp_uge_v8i64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: cmphs [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP1]].d, [[OP2]].d
; VBITS_GE_512-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1
; VBITS_GE_512-NEXT: st1d { [[SEXT]].d }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl8
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: cmphs p1.d, p0/z, z0.d, z1.d
; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <8 x i64>, <8 x i64>* %a
%op2 = load <8 x i64>, <8 x i64>* %b
%cmp = icmp uge <8 x i64> %op1, %op2
@@ -566,15 +585,16 @@ define void @icmp_uge_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
; ICMP UGT
;
define void @icmp_ugt_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
define void @icmp_ugt_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_ugt_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK-NEXT: cmphi [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP1]].d, [[OP2]].d
; CHECK-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1
; CHECK-NEXT: st1d { [[SEXT]].d }, [[PG]], [x0]
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: cmphi p1.d, p0/z, z0.d, z1.d
; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <4 x i64>, <4 x i64>* %a
%op2 = load <4 x i64>, <4 x i64>* %b
%cmp = icmp ugt <4 x i64> %op1, %op2
@ -587,15 +607,16 @@ define void @icmp_ugt_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
; ICMP ULE
;
define void @icmp_ule_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
define void @icmp_ule_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: icmp_ule_v16i64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: cmphs [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP2]].d, [[OP1]].d
; VBITS_GE_1024-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1
; VBITS_GE_1024-NEXT: st1d { [[SEXT]].d }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: cmphs p1.d, p0/z, z1.d, z0.d
; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%op2 = load <16 x i64>, <16 x i64>* %b
%cmp = icmp ule <16 x i64> %op1, %op2
@ -608,15 +629,16 @@ define void @icmp_ule_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
; ICMP ULT
;
define void @icmp_ult_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
define void @icmp_ult_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: icmp_ult_v32i64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: cmphi [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP2]].d, [[OP1]].d
; VBITS_GE_2048-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1
; VBITS_GE_2048-NEXT: st1d { [[SEXT]].d }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: cmphi p1.d, p0/z, z1.d, z0.d
; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%op2 = load <32 x i64>, <32 x i64>* %b
%cmp = icmp ult <32 x i64> %op1, %op2

File diff suppressed because it is too large (7 files)
View File

@ -1,62 +1,50 @@
; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32
; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32
; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
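; Note that the min=2048 run reuses the VBITS_GE_512 prefixes: once a fixed
; width vector fits in a single SVE register, the generated code is identical
; for every larger vector length, so no separate VBITS_GE_2048 checks are
; needed.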
target triple = "aarch64-unknown-linux-gnu"
; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: ptrue
; Don't use SVE for 64-bit vectors.
define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, i1 %mask) #0 {
define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v8i8:
; CHECK: tst w0, #0x1
; CHECK-NEXT: csetm w8, ne
; CHECK-NEXT: dup v2.8b, w8
; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
; CHECK-NEXT: csetm w8, ne
; CHECK-NEXT: dup v2.8b, w8
; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
; CHECK-NEXT: ret
%sel = select i1 %mask, <8 x i8> %op1, <8 x i8> %op2
ret <8 x i8> %sel
}
; Don't use SVE for 128-bit vectors.
define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, i1 %mask) #0 {
define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v16i8:
; CHECK: tst w0, #0x1
; CHECK-NEXT: csetm w8, ne
; CHECK-NEXT: dup v2.16b, w8
; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
; CHECK-NEXT: csetm w8, ne
; CHECK-NEXT: dup v2.16b, w8
; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT: ret
%sel = select i1 %mask, <16 x i8> %op1, <16 x i8> %op2
ret <16 x i8> %sel
}
define void @select_v32i8(<32 x i8>* %a, <32 x i8>* %b, i1 %mask) #0 {
define void @select_v32i8(<32 x i8>* %a, <32 x i8>* %b, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v32i8:
; CHECK: and w[[AND:[0-9]+]], w2, #0x1
; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
; CHECK-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
; CHECK-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].b
; CHECK-NEXT: mov [[TMP1:z[0-9]+]].b, w[[AND]]
; CHECK-NEXT: and [[TMP2:z[0-9]+]].b, [[TMP1]].b, #0x1
; CHECK-NEXT: cmpne [[PRES:p[0-9]+]].b, [[PG2]]/z, [[TMP2]].b, #0
; CHECK-NEXT: sel [[RES:z[0-9]+]].b, [[PRES]], [[OP1]].b, [[OP2]].b
; CHECK-NEXT: st1b { [[RES]].b }, [[PG1]], [x0]
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w2, #0x1
; CHECK-NEXT: ptrue p0.b, vl32
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: mov z2.b, w8
; CHECK-NEXT: and z2.b, z2.b, #0x1
; CHECK-NEXT: cmpne p1.b, p1/z, z2.b, #0
; CHECK-NEXT: sel z0.b, p1, z0.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load volatile <32 x i8>, <32 x i8>* %a
%op2 = load volatile <32 x i8>, <32 x i8>* %b
%sel = select i1 %mask, <32 x i8> %op1, <32 x i8> %op2
@ -65,18 +53,38 @@ define void @select_v32i8(<32 x i8>* %a, <32 x i8>* %b, i1 %mask) #0 {
}
define void @select_v64i8(<64 x i8>* %a, <64 x i8>* %b, i1 %mask) #0 {
; CHECK-LABEL: select_v64i8:
; VBITS_GE_512: and w[[AND:[0-9]+]], w2, #0x1
; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
; VBITS_GE_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
; VBITS_GE_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].b
; VBITS_GE_512-NEXT: mov [[TMP1:z[0-9]+]].b, w[[AND]]
; VBITS_GE_512-NEXT: and [[TMP2:z[0-9]+]].b, [[TMP1]].b, #0x1
; VBITS_GE_512-NEXT: cmpne [[PRES:p[0-9]+]].b, [[PG2]]/z, [[TMP2]].b, #0
; VBITS_GE_512-NEXT: sel [[RES:z[0-9]+]].b, [[PRES]], [[OP1]].b, [[OP2]].b
; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG1]], [x0]
; VBITS_GE_512-NEXT: ret
; VBITS_GE_256-LABEL: select_v64i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov w8, #32
; VBITS_GE_256-NEXT: and w9, w2, #0x1
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: ptrue p1.b
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT: mov z4.b, w9
; VBITS_GE_256-NEXT: and z4.b, z4.b, #0x1
; VBITS_GE_256-NEXT: cmpne p1.b, p1/z, z4.b, #0
; VBITS_GE_256-NEXT: sel z1.b, p1, z1.b, z3.b
; VBITS_GE_256-NEXT: sel z0.b, p1, z0.b, z2.b
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: select_v64i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: and w8, w2, #0x1
; VBITS_GE_512-NEXT: ptrue p0.b, vl64
; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT: ptrue p1.b
; VBITS_GE_512-NEXT: mov z2.b, w8
; VBITS_GE_512-NEXT: and z2.b, z2.b, #0x1
; VBITS_GE_512-NEXT: cmpne p1.b, p1/z, z2.b, #0
; VBITS_GE_512-NEXT: sel z0.b, p1, z0.b, z1.b
; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load volatile <64 x i8>, <64 x i8>* %a
%op2 = load volatile <64 x i8>, <64 x i8>* %b
%sel = select i1 %mask, <64 x i8> %op1, <64 x i8> %op2
@ -84,19 +92,20 @@ define void @select_v64i8(<64 x i8>* %a, <64 x i8>* %b, i1 %mask) #0 {
ret void
}
define void @select_v128i8(<128 x i8>* %a, <128 x i8>* %b, i1 %mask) #0 {
define void @select_v128i8(<128 x i8>* %a, <128 x i8>* %b, i1 %mask) vscale_range(8,0) #0 {
; CHECK-LABEL: select_v128i8:
; VBITS_GE_1024: and w[[AND:[0-9]+]], w2, #0x1
; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
; VBITS_GE_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
; VBITS_GE_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].b
; VBITS_GE_1024-NEXT: mov [[TMP1:z[0-9]+]].b, w[[AND]]
; VBITS_GE_1024-NEXT: and [[TMP2:z[0-9]+]].b, [[TMP1]].b, #0x1
; VBITS_GE_1024-NEXT: cmpne [[PRES:p[0-9]+]].b, [[PG2]]/z, [[TMP2]].b, #0
; VBITS_GE_1024-NEXT: sel [[RES:z[0-9]+]].b, [[PRES]], [[OP1]].b, [[OP2]].b
; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG1]], [x0]
; VBITS_GE_1024-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w2, #0x1
; CHECK-NEXT: ptrue p0.b, vl128
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: mov z2.b, w8
; CHECK-NEXT: and z2.b, z2.b, #0x1
; CHECK-NEXT: cmpne p1.b, p1/z, z2.b, #0
; CHECK-NEXT: sel z0.b, p1, z0.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load volatile <128 x i8>, <128 x i8>* %a
%op2 = load volatile <128 x i8>, <128 x i8>* %b
%sel = select i1 %mask, <128 x i8> %op1, <128 x i8> %op2
@ -104,19 +113,20 @@ define void @select_v128i8(<128 x i8>* %a, <128 x i8>* %b, i1 %mask) #0 {
ret void
}
define void @select_v256i8(<256 x i8>* %a, <256 x i8>* %b, i1 %mask) #0 {
define void @select_v256i8(<256 x i8>* %a, <256 x i8>* %b, i1 %mask) vscale_range(16,0) #0 {
; CHECK-LABEL: select_v256i8:
; VBITS_GE_2048: and w[[AND:[0-9]+]], w2, #0x1
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].b, vl[[#min(VBYTES,256)]]
; VBITS_GE_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
; VBITS_GE_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].b
; VBITS_GE_2048-NEXT: mov [[TMP1:z[0-9]+]].b, w[[AND]]
; VBITS_GE_2048-NEXT: and [[TMP2:z[0-9]+]].b, [[TMP1]].b, #0x1
; VBITS_GE_2048-NEXT: cmpne [[PRES:p[0-9]+]].b, [[PG2]]/z, [[TMP2]].b, #0
; VBITS_GE_2048-NEXT: sel [[RES:z[0-9]+]].b, [[PRES]], [[OP1]].b, [[OP2]].b
; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG1]], [x0]
; VBITS_GE_2048-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w2, #0x1
; CHECK-NEXT: ptrue p0.b, vl256
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: mov z2.b, w8
; CHECK-NEXT: and z2.b, z2.b, #0x1
; CHECK-NEXT: cmpne p1.b, p1/z, z2.b, #0
; CHECK-NEXT: sel z0.b, p1, z0.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load volatile <256 x i8>, <256 x i8>* %a
%op2 = load volatile <256 x i8>, <256 x i8>* %b
%sel = select i1 %mask, <256 x i8> %op1, <256 x i8> %op2
@ -125,42 +135,45 @@ define void @select_v256i8(<256 x i8>* %a, <256 x i8>* %b, i1 %mask) #0 {
}
; Don't use SVE for 64-bit vectors.
define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, i1 %mask) #0 {
define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4i16:
; CHECK: tst w0, #0x1
; CHECK-NEXT: csetm w8, ne
; CHECK-NEXT: dup v2.4h, w8
; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
; CHECK-NEXT: csetm w8, ne
; CHECK-NEXT: dup v2.4h, w8
; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
; CHECK-NEXT: ret
%sel = select i1 %mask, <4 x i16> %op1, <4 x i16> %op2
ret <4 x i16> %sel
}
; Don't use SVE for 128-bit vectors.
define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, i1 %mask) #0 {
define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v8i16:
; CHECK: tst w0, #0x1
; CHECK-NEXT: csetm w8, ne
; CHECK-NEXT: dup v2.8h, w8
; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
; CHECK-NEXT: csetm w8, ne
; CHECK-NEXT: dup v2.8h, w8
; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT: ret
%sel = select i1 %mask, <8 x i16> %op1, <8 x i16> %op2
ret <8 x i16> %sel
}
define void @select_v16i16(<16 x i16>* %a, <16 x i16>* %b, i1 %mask) #0 {
define void @select_v16i16(<16 x i16>* %a, <16 x i16>* %b, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v16i16:
; CHECK: and w[[AND:[0-9]+]], w2, #0x1
; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
; CHECK-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
; CHECK-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].h
; CHECK-NEXT: mov [[TMP1:z[0-9]+]].h, w[[AND]]
; CHECK-NEXT: and [[TMP2:z[0-9]+]].h, [[TMP1]].h, #0x1
; CHECK-NEXT: cmpne [[PRES:p[0-9]+]].h, [[PG2]]/z, [[TMP2]].h, #0
; CHECK-NEXT: sel [[RES:z[0-9]+]].h, [[PRES]], [[OP1]].h, [[OP2]].h
; CHECK-NEXT: st1h { [[RES]].h }, [[PG1]], [x0]
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w2, #0x1
; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: ptrue p1.h
; CHECK-NEXT: mov z2.h, w8
; CHECK-NEXT: and z2.h, z2.h, #0x1
; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0
; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load volatile <16 x i16>, <16 x i16>* %a
%op2 = load volatile <16 x i16>, <16 x i16>* %b
%sel = select i1 %mask, <16 x i16> %op1, <16 x i16> %op2
@ -169,18 +182,38 @@ define void @select_v16i16(<16 x i16>* %a, <16 x i16>* %b, i1 %mask) #0 {
}
define void @select_v32i16(<32 x i16>* %a, <32 x i16>* %b, i1 %mask) #0 {
; CHECK-LABEL: select_v32i16:
; VBITS_GE_512: and w[[AND:[0-9]+]], w2, #0x1
; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
; VBITS_GE_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
; VBITS_GE_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].h
; VBITS_GE_512-NEXT: mov [[TMP1:z[0-9]+]].h, w[[AND]]
; VBITS_GE_512-NEXT: and [[TMP2:z[0-9]+]].h, [[TMP1]].h, #0x1
; VBITS_GE_512-NEXT: cmpne [[PRES:p[0-9]+]].h, [[PG2]]/z, [[TMP2]].h, #0
; VBITS_GE_512-NEXT: sel [[RES:z[0-9]+]].h, [[PRES]], [[OP1]].h, [[OP2]].h
; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG1]], [x0]
; VBITS_GE_512-NEXT: ret
; VBITS_GE_256-LABEL: select_v32i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: and w9, w2, #0x1
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ptrue p1.h
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: mov z4.h, w9
; VBITS_GE_256-NEXT: and z4.h, z4.h, #0x1
; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z4.h, #0
; VBITS_GE_256-NEXT: sel z1.h, p1, z1.h, z3.h
; VBITS_GE_256-NEXT: sel z0.h, p1, z0.h, z2.h
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: select_v32i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: and w8, w2, #0x1
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT: ptrue p1.h
; VBITS_GE_512-NEXT: mov z2.h, w8
; VBITS_GE_512-NEXT: and z2.h, z2.h, #0x1
; VBITS_GE_512-NEXT: cmpne p1.h, p1/z, z2.h, #0
; VBITS_GE_512-NEXT: sel z0.h, p1, z0.h, z1.h
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load volatile <32 x i16>, <32 x i16>* %a
%op2 = load volatile <32 x i16>, <32 x i16>* %b
%sel = select i1 %mask, <32 x i16> %op1, <32 x i16> %op2
@ -188,19 +221,20 @@ define void @select_v32i16(<32 x i16>* %a, <32 x i16>* %b, i1 %mask) #0 {
ret void
}
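; In the VBITS_GE_256 output above, the <32 x i16> operands are split across
; two 256-bit registers: x8 holds the element offset of the high half (#16)
; and the address is formed as [x0, x8, lsl #1], where lsl #1 scales by the
; 2-byte element size (the .s and .d variants use lsl #2 and lsl #3 in the
; same way).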
define void @select_v64i16(<64 x i16>* %a, <64 x i16>* %b, i1 %mask) #0 {
define void @select_v64i16(<64 x i16>* %a, <64 x i16>* %b, i1 %mask) vscale_range(8,0) #0 {
; CHECK-LABEL: select_v64i16:
; VBITS_GE_1024: and w[[AND:[0-9]+]], w2, #0x1
; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
; VBITS_GE_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
; VBITS_GE_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].h
; VBITS_GE_1024-NEXT: mov [[TMP1:z[0-9]+]].h, w[[AND]]
; VBITS_GE_1024-NEXT: and [[TMP2:z[0-9]+]].h, [[TMP1]].h, #0x1
; VBITS_GE_1024-NEXT: cmpne [[PRES:p[0-9]+]].h, [[PG2]]/z, [[TMP2]].h, #0
; VBITS_GE_1024-NEXT: sel [[RES:z[0-9]+]].h, [[PRES]], [[OP1]].h, [[OP2]].h
; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG1]], [x0]
; VBITS_GE_1024-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w2, #0x1
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: ptrue p1.h
; CHECK-NEXT: mov z2.h, w8
; CHECK-NEXT: and z2.h, z2.h, #0x1
; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0
; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load volatile <64 x i16>, <64 x i16>* %a
%op2 = load volatile <64 x i16>, <64 x i16>* %b
%sel = select i1 %mask, <64 x i16> %op1, <64 x i16> %op2
@ -208,19 +242,20 @@ define void @select_v64i16(<64 x i16>* %a, <64 x i16>* %b, i1 %mask) #0 {
ret void
}
define void @select_v128i16(<128 x i16>* %a, <128 x i16>* %b, i1 %mask) #0 {
define void @select_v128i16(<128 x i16>* %a, <128 x i16>* %b, i1 %mask) vscale_range(16,0) #0 {
; CHECK-LABEL: select_v128i16:
; VBITS_GE_2048: and w[[AND:[0-9]+]], w2, #0x1
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
; VBITS_GE_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
; VBITS_GE_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].h
; VBITS_GE_2048-NEXT: mov [[TMP1:z[0-9]+]].h, w[[AND]]
; VBITS_GE_2048-NEXT: and [[TMP2:z[0-9]+]].h, [[TMP1]].h, #0x1
; VBITS_GE_2048-NEXT: cmpne [[PRES:p[0-9]+]].h, [[PG2]]/z, [[TMP2]].h, #0
; VBITS_GE_2048-NEXT: sel [[RES:z[0-9]+]].h, [[PRES]], [[OP1]].h, [[OP2]].h
; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG1]], [x0]
; VBITS_GE_2048-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w2, #0x1
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: ptrue p1.h
; CHECK-NEXT: mov z2.h, w8
; CHECK-NEXT: and z2.h, z2.h, #0x1
; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0
; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load volatile <128 x i16>, <128 x i16>* %a
%op2 = load volatile <128 x i16>, <128 x i16>* %b
%sel = select i1 %mask, <128 x i16> %op1, <128 x i16> %op2
@ -229,42 +264,45 @@ define void @select_v128i16(<128 x i16>* %a, <128 x i16>* %b, i1 %mask) #0 {
}
; Don't use SVE for 64-bit vectors.
define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, i1 %mask) #0 {
define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v2i32:
; CHECK: tst w0, #0x1
; CHECK-NEXT: csetm w8, ne
; CHECK-NEXT: dup v2.2s, w8
; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
; CHECK-NEXT: csetm w8, ne
; CHECK-NEXT: dup v2.2s, w8
; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
; CHECK-NEXT: ret
%sel = select i1 %mask, <2 x i32> %op1, <2 x i32> %op2
ret <2 x i32> %sel
}
; Don't use SVE for 128-bit vectors.
define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) #0 {
define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4i32:
; CHECK: tst w0, #0x1
; CHECK-NEXT: csetm w8, ne
; CHECK-NEXT: dup v2.4s, w8
; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
; CHECK-NEXT: csetm w8, ne
; CHECK-NEXT: dup v2.4s, w8
; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT: ret
%sel = select i1 %mask, <4 x i32> %op1, <4 x i32> %op2
ret <4 x i32> %sel
}
define void @select_v8i32(<8 x i32>* %a, <8 x i32>* %b, i1 %mask) #0 {
define void @select_v8i32(<8 x i32>* %a, <8 x i32>* %b, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v8i32:
; CHECK: and w[[AND:[0-9]+]], w2, #0x1
; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
; CHECK-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0]
; CHECK-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1]
; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].s
; CHECK-NEXT: mov [[TMP1:z[0-9]+]].s, w[[AND]]
; CHECK-NEXT: and [[TMP2:z[0-9]+]].s, [[TMP1]].s, #0x1
; CHECK-NEXT: cmpne [[PRES:p[0-9]+]].s, [[PG2]]/z, [[TMP2]].s, #0
; CHECK-NEXT: sel [[RES:z[0-9]+]].s, [[PRES]], [[OP1]].s, [[OP2]].s
; CHECK-NEXT: st1w { [[RES]].s }, [[PG1]], [x0]
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w2, #0x1
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: ptrue p1.s
; CHECK-NEXT: mov z2.s, w8
; CHECK-NEXT: and z2.s, z2.s, #0x1
; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0
; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load volatile <8 x i32>, <8 x i32>* %a
%op2 = load volatile <8 x i32>, <8 x i32>* %b
%sel = select i1 %mask, <8 x i32> %op1, <8 x i32> %op2
@ -273,18 +311,38 @@ define void @select_v8i32(<8 x i32>* %a, <8 x i32>* %b, i1 %mask) #0 {
}
define void @select_v16i32(<16 x i32>* %a, <16 x i32>* %b, i1 %mask) #0 {
; CHECK-LABEL: select_v16i32:
; VBITS_GE_512: and w[[AND:[0-9]+]], w2, #0x1
; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
; VBITS_GE_512-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0]
; VBITS_GE_512-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1]
; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s
; VBITS_GE_512-NEXT: mov [[TMP1:z[0-9]+]].s, w[[AND]]
; VBITS_GE_512-NEXT: and [[TMP2:z[0-9]+]].s, [[TMP1]].s, #0x1
; VBITS_GE_512-NEXT: cmpne [[PRES:p[0-9]+]].s, [[PG2]]/z, [[TMP2]].s, #0
; VBITS_GE_512-NEXT: sel [[RES:z[0-9]+]].s, [[PRES]], [[OP1]].s, [[OP2]].s
; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG1]], [x0]
; VBITS_GE_512-NEXT: ret
; VBITS_GE_256-LABEL: select_v16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: and w9, w2, #0x1
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ptrue p1.s
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: mov z4.s, w9
; VBITS_GE_256-NEXT: and z4.s, z4.s, #0x1
; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z4.s, #0
; VBITS_GE_256-NEXT: sel z1.s, p1, z1.s, z3.s
; VBITS_GE_256-NEXT: sel z0.s, p1, z0.s, z2.s
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: select_v16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: and w8, w2, #0x1
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: ptrue p1.s
; VBITS_GE_512-NEXT: mov z2.s, w8
; VBITS_GE_512-NEXT: and z2.s, z2.s, #0x1
; VBITS_GE_512-NEXT: cmpne p1.s, p1/z, z2.s, #0
; VBITS_GE_512-NEXT: sel z0.s, p1, z0.s, z1.s
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load volatile <16 x i32>, <16 x i32>* %a
%op2 = load volatile <16 x i32>, <16 x i32>* %b
%sel = select i1 %mask, <16 x i32> %op1, <16 x i32> %op2
@ -292,19 +350,20 @@ define void @select_v16i32(<16 x i32>* %a, <16 x i32>* %b, i1 %mask) #0 {
ret void
}
define void @select_v32i32(<32 x i32>* %a, <32 x i32>* %b, i1 %mask) #0 {
define void @select_v32i32(<32 x i32>* %a, <32 x i32>* %b, i1 %mask) vscale_range(8,0) #0 {
; CHECK-LABEL: select_v32i32:
; VBITS_GE_1024: and w[[AND:[0-9]+]], w2, #0x1
; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
; VBITS_GE_1024-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0]
; VBITS_GE_1024-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1]
; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s
; VBITS_GE_1024-NEXT: mov [[TMP1:z[0-9]+]].s, w[[AND]]
; VBITS_GE_1024-NEXT: and [[TMP2:z[0-9]+]].s, [[TMP1]].s, #0x1
; VBITS_GE_1024-NEXT: cmpne [[PRES:p[0-9]+]].s, [[PG2]]/z, [[TMP2]].s, #0
; VBITS_GE_1024-NEXT: sel [[RES:z[0-9]+]].s, [[PRES]], [[OP1]].s, [[OP2]].s
; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG1]], [x0]
; VBITS_GE_1024-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w2, #0x1
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: ptrue p1.s
; CHECK-NEXT: mov z2.s, w8
; CHECK-NEXT: and z2.s, z2.s, #0x1
; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0
; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load volatile <32 x i32>, <32 x i32>* %a
%op2 = load volatile <32 x i32>, <32 x i32>* %b
%sel = select i1 %mask, <32 x i32> %op1, <32 x i32> %op2
@ -312,19 +371,20 @@ define void @select_v32i32(<32 x i32>* %a, <32 x i32>* %b, i1 %mask) #0 {
ret void
}
define void @select_v64i32(<64 x i32>* %a, <64 x i32>* %b, i1 %mask) #0 {
define void @select_v64i32(<64 x i32>* %a, <64 x i32>* %b, i1 %mask) vscale_range(16,0) #0 {
; CHECK-LABEL: select_v64i32:
; VBITS_GE_2048: and w[[AND:[0-9]+]], w2, #0x1
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
; VBITS_GE_2048-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0]
; VBITS_GE_2048-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1]
; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s
; VBITS_GE_2048-NEXT: mov [[TMP1:z[0-9]+]].s, w[[AND]]
; VBITS_GE_2048-NEXT: and [[TMP2:z[0-9]+]].s, [[TMP1]].s, #0x1
; VBITS_GE_2048-NEXT: cmpne [[PRES:p[0-9]+]].s, [[PG2]]/z, [[TMP2]].s, #0
; VBITS_GE_2048-NEXT: sel [[RES:z[0-9]+]].s, [[PRES]], [[OP1]].s, [[OP2]].s
; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG1]], [x0]
; VBITS_GE_2048-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w2, #0x1
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: ptrue p1.s
; CHECK-NEXT: mov z2.s, w8
; CHECK-NEXT: and z2.s, z2.s, #0x1
; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0
; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load volatile <64 x i32>, <64 x i32>* %a
%op2 = load volatile <64 x i32>, <64 x i32>* %b
%sel = select i1 %mask, <64 x i32> %op1, <64 x i32> %op2
@ -333,42 +393,45 @@ define void @select_v64i32(<64 x i32>* %a, <64 x i32>* %b, i1 %mask) #0 {
}
; Don't use SVE for 64-bit vectors.
define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, i1 %mask) #0 {
define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v1i64:
; CHECK: tst w0, #0x1
; CHECK-NEXT: csetm x8, ne
; CHECK-NEXT: fmov d2, x8
; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
; CHECK-NEXT: csetm x8, ne
; CHECK-NEXT: fmov d2, x8
; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
; CHECK-NEXT: ret
%sel = select i1 %mask, <1 x i64> %op1, <1 x i64> %op2
ret <1 x i64> %sel
}
; Don't use SVE for 128-bit vectors.
define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) #0 {
define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v2i64:
; CHECK: tst w0, #0x1
; CHECK-NEXT: csetm x8, ne
; CHECK-NEXT: dup v2.2d, x8
; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
; CHECK-NEXT: csetm x8, ne
; CHECK-NEXT: dup v2.2d, x8
; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT: ret
%sel = select i1 %mask, <2 x i64> %op1, <2 x i64> %op2
ret <2 x i64> %sel
}
define void @select_v4i64(<4 x i64>* %a, <4 x i64>* %b, i1 %mask) #0 {
define void @select_v4i64(<4 x i64>* %a, <4 x i64>* %b, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4i64:
; CHECK: and w[[AND:[0-9]+]], w2, #0x1
; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
; CHECK-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG1]]/z, [x0]
; CHECK-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG1]]/z, [x1]
; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].d
; CHECK-NEXT: mov [[TMP1:z[0-9]+]].d, x[[AND]]
; CHECK-NEXT: and [[TMP2:z[0-9]+]].d, [[TMP1]].d, #0x1
; CHECK-NEXT: cmpne [[PRES:p[0-9]+]].d, [[PG2]]/z, [[TMP2]].d, #0
; CHECK-NEXT: sel [[RES:z[0-9]+]].d, [[PRES]], [[OP1]].d, [[OP2]].d
; CHECK-NEXT: st1d { [[RES]].d }, [[PG1]], [x0]
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w2, #0x1
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: mov z2.d, x8
; CHECK-NEXT: and z2.d, z2.d, #0x1
; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load volatile <4 x i64>, <4 x i64>* %a
%op2 = load volatile <4 x i64>, <4 x i64>* %b
%sel = select i1 %mask, <4 x i64> %op1, <4 x i64> %op2
@ -377,18 +440,38 @@ define void @select_v4i64(<4 x i64>* %a, <4 x i64>* %b, i1 %mask) #0 {
}
define void @select_v8i64(<8 x i64>* %a, <8 x i64>* %b, i1 %mask) #0 {
; CHECK-LABEL: select_v8i64:
; VBITS_GE_512: and w[[AND:[0-9]+]], w2, #0x1
; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
; VBITS_GE_512-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG1]]/z, [x0]
; VBITS_GE_512-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG1]]/z, [x1]
; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d
; VBITS_GE_512-NEXT: mov [[TMP1:z[0-9]+]].d, x[[AND]]
; VBITS_GE_512-NEXT: and [[TMP2:z[0-9]+]].d, [[TMP1]].d, #0x1
; VBITS_GE_512-NEXT: cmpne [[PRES:p[0-9]+]].d, [[PG2]]/z, [[TMP2]].d, #0
; VBITS_GE_512-NEXT: sel [[RES:z[0-9]+]].d, [[PRES]], [[OP1]].d, [[OP2]].d
; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG1]], [x0]
; VBITS_GE_512-NEXT: ret
; VBITS_GE_256-LABEL: select_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: and w9, w2, #0x1
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ptrue p1.d
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: mov z4.d, x9
; VBITS_GE_256-NEXT: and z4.d, z4.d, #0x1
; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z4.d, #0
; VBITS_GE_256-NEXT: sel z1.d, p1, z1.d, z3.d
; VBITS_GE_256-NEXT: sel z0.d, p1, z0.d, z2.d
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: select_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: and w8, w2, #0x1
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: ptrue p1.d
; VBITS_GE_512-NEXT: mov z2.d, x8
; VBITS_GE_512-NEXT: and z2.d, z2.d, #0x1
; VBITS_GE_512-NEXT: cmpne p1.d, p1/z, z2.d, #0
; VBITS_GE_512-NEXT: sel z0.d, p1, z0.d, z1.d
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load volatile <8 x i64>, <8 x i64>* %a
%op2 = load volatile <8 x i64>, <8 x i64>* %b
%sel = select i1 %mask, <8 x i64> %op1, <8 x i64> %op2
@ -396,19 +479,20 @@ define void @select_v8i64(<8 x i64>* %a, <8 x i64>* %b, i1 %mask) #0 {
ret void
}
define void @select_v16i64(<16 x i64>* %a, <16 x i64>* %b, i1 %mask) #0 {
define void @select_v16i64(<16 x i64>* %a, <16 x i64>* %b, i1 %mask) vscale_range(8,0) #0 {
; CHECK-LABEL: select_v16i64:
; VBITS_GE_1024: and w[[AND:[0-9]+]], w2, #0x1
; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
; VBITS_GE_1024-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG1]]/z, [x0]
; VBITS_GE_1024-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG1]]/z, [x1]
; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d
; VBITS_GE_1024-NEXT: mov [[TMP1:z[0-9]+]].d, x[[AND]]
; VBITS_GE_1024-NEXT: and [[TMP2:z[0-9]+]].d, [[TMP1]].d, #0x1
; VBITS_GE_1024-NEXT: cmpne [[PRES:p[0-9]+]].d, [[PG2]]/z, [[TMP2]].d, #0
; VBITS_GE_1024-NEXT: sel [[RES:z[0-9]+]].d, [[PRES]], [[OP1]].d, [[OP2]].d
; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG1]], [x0]
; VBITS_GE_1024-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w2, #0x1
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: mov z2.d, x8
; CHECK-NEXT: and z2.d, z2.d, #0x1
; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load volatile <16 x i64>, <16 x i64>* %a
%op2 = load volatile <16 x i64>, <16 x i64>* %b
%sel = select i1 %mask, <16 x i64> %op1, <16 x i64> %op2
@ -416,19 +500,20 @@ define void @select_v16i64(<16 x i64>* %a, <16 x i64>* %b, i1 %mask) #0 {
ret void
}
define void @select_v32i64(<32 x i64>* %a, <32 x i64>* %b, i1 %mask) #0 {
define void @select_v32i64(<32 x i64>* %a, <32 x i64>* %b, i1 %mask) vscale_range(16,0) #0 {
; CHECK-LABEL: select_v32i64:
; VBITS_GE_2048: and w[[AND:[0-9]+]], w2, #0x1
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
; VBITS_GE_2048-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG1]]/z, [x0]
; VBITS_GE_2048-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG1]]/z, [x1]
; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d
; VBITS_GE_2048-NEXT: mov [[TMP1:z[0-9]+]].d, x[[AND]]
; VBITS_GE_2048-NEXT: and [[TMP2:z[0-9]+]].d, [[TMP1]].d, #0x1
; VBITS_GE_2048-NEXT: cmpne [[PRES:p[0-9]+]].d, [[PG2]]/z, [[TMP2]].d, #0
; VBITS_GE_2048-NEXT: sel [[RES:z[0-9]+]].d, [[PRES]], [[OP1]].d, [[OP2]].d
; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG1]], [x0]
; VBITS_GE_2048-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w2, #0x1
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: mov z2.d, x8
; CHECK-NEXT: and z2.d, z2.d, #0x1
; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load volatile <32 x i64>, <32 x i64>* %a
%op2 = load volatile <32 x i64>, <32 x i64>* %b
%sel = select i1 %mask, <32 x i64> %op1, <32 x i64> %op2

File diff suppressed because it is too large (3 files)

View File

@ -1,35 +1,29 @@
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK
; VBYTES represents the useful byte size of a vector register from the code
; generator's point of view. It is clamped to power-of-2 values because
; only power-of-2 vector lengths are considered legal, regardless of the
; user specified vector length.
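; For example, -aarch64-sve-vector-bits-min=384 binds -D#VBYTES=32 because
; 384 is clamped down to 256 bits, so a check expression such as
;   ptrue p0.s, vl[[#min(div(VBYTES,4),8)]]
; evaluates to "ptrue p0.s, vl8" for that run.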
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_2048
target triple = "aarch64-unknown-linux-gnu"
; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: ptrue
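; With the NO_SVE prefix gone, the min=128 run above instead pipes llc output
; through "not grep ptrue", which fails the test if any ptrue (SVE predicate
; setup) appears when the registers are no bigger than NEON.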
; Don't use SVE for 64-bit vectors.
define <2 x float> @load_v2f32(<2 x float>* %a) #0 {
; CHECK-LABEL: load_v2f32:
; CHECK: ldr d0, [x0]
; CHECK: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ret
%load = load <2 x float>, <2 x float>* %a
ret <2 x float> %load
}
@ -37,66 +31,164 @@ define <2 x float> @load_v2f32(<2 x float>* %a) #0 {
; Don't use SVE for 128-bit vectors.
define <4 x float> @load_v4f32(<4 x float>* %a) #0 {
; CHECK-LABEL: load_v4f32:
; CHECK: ldr q0, [x0]
; CHECK: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ret
%load = load <4 x float>, <4 x float>* %a
ret <4 x float> %load
}
define <8 x float> @load_v8f32(<8 x float>* %a) #0 {
; CHECK-LABEL: load_v8f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
; CHECK: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0]
; CHECK: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: st1w { z0.s }, p0, [x8]
; CHECK-NEXT: ret
%load = load <8 x float>, <8 x float>* %a
ret <8 x float> %load
}
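; Note the st1w to [x8]: vector results too large for NEON registers are
; returned indirectly, with AAPCS64 passing the address of the result in x8.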
define <16 x float> @load_v16f32(<16 x float>* %a) #0 {
; CHECK-LABEL: load_v16f32:
; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
; CHECK-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0]
; VBITS_LE_256-DAG: mov x[[A1:[0-9]+]], #[[#div(VBYTES,4)]]
; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A1]], lsl #2]
; CHECK: ret
; VBITS_GE_256-LABEL: load_v16f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x9, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: load_v16f32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_512-NEXT: ret
;
; VBITS_GE_1024-LABEL: load_v16f32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: load_v16f32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl16
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_2048-NEXT: ret
%load = load <16 x float>, <16 x float>* %a
ret <16 x float> %load
}
define <32 x float> @load_v32f32(<32 x float>* %a) #0 {
; CHECK-LABEL: load_v32f32:
; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
; CHECK-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0]
; VBITS_LE_512-DAG: mov x[[A1:[0-9]+]], #[[#div(VBYTES,4)]]
; VBITS_LE_512-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A1]], lsl #2]
; VBITS_LE_256-DAG: mov x[[A2:[0-9]+]], #[[#mul(div(VBYTES,4),2)]]
; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A2]], lsl #2]
; VBITS_LE_256-DAG: mov x[[A3:[0-9]+]], #[[#mul(div(VBYTES,4),3)]]
; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A3]], lsl #2]
; CHECK: ret
; VBITS_GE_256-LABEL: load_v32f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x9, #16
; VBITS_GE_256-NEXT: mov x10, #24
; VBITS_GE_256-NEXT: mov x11, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x8, x11, lsl #2]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: load_v32f32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: mov x9, #16
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_512-NEXT: st1w { z1.s }, p0, [x8]
; VBITS_GE_512-NEXT: ret
;
; VBITS_GE_1024-LABEL: load_v32f32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: load_v32f32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_2048-NEXT: ret
%load = load <32 x float>, <32 x float>* %a
ret <32 x float> %load
}
define <64 x float> @load_v64f32(<64 x float>* %a) #0 {
; CHECK-LABEL: load_v64f32:
; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
; CHECK-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0]
; VBITS_LE_1024-DAG: mov x[[A1:[0-9]+]], #[[#div(VBYTES,4)]]
; VBITS_LE_1024-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A1]], lsl #2]
; VBITS_LE_512-DAG: mov x[[A2:[0-9]+]], #[[#mul(div(VBYTES,4),2)]]
; VBITS_LE_512-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A2]], lsl #2]
; VBITS_LE_512-DAG: mov x[[A3:[0-9]+]], #[[#mul(div(VBYTES,4),3)]]
; VBITS_LE_512-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A3]], lsl #2]
; VBITS_LE_256-DAG: mov x[[A4:[0-9]+]], #[[#mul(div(VBYTES,4),4)]]
; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A4]], lsl #2]
; VBITS_LE_256-DAG: mov x[[A5:[0-9]+]], #[[#mul(div(VBYTES,4),5)]]
; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A5]], lsl #2]
; VBITS_LE_256-DAG: mov x[[A6:[0-9]+]], #[[#mul(div(VBYTES,4),6)]]
; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A6]], lsl #2]
; VBITS_LE_256-DAG: mov x[[A7:[0-9]+]], #[[#mul(div(VBYTES,4),7)]]
; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A7]], lsl #2]
; CHECK: ret
; VBITS_GE_256-LABEL: load_v64f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x9, #8
; VBITS_GE_256-NEXT: mov x10, #48
; VBITS_GE_256-NEXT: mov x11, #56
; VBITS_GE_256-NEXT: mov x12, #32
; VBITS_GE_256-NEXT: mov x13, #40
; VBITS_GE_256-NEXT: mov x14, #16
; VBITS_GE_256-NEXT: mov x15, #24
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x11, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x12, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x13, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x14, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x15, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x11, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x8, x13, lsl #2]
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x8, x12, lsl #2]
; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x8, x15, lsl #2]
; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x8, x14, lsl #2]
; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: load_v64f32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: mov x9, #32
; VBITS_GE_512-NEXT: mov x10, #48
; VBITS_GE_512-NEXT: mov x11, #16
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_512-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2]
; VBITS_GE_512-NEXT: ld1w { z3.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: st1w { z1.s }, p0, [x8, x10, lsl #2]
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_512-NEXT: st1w { z2.s }, p0, [x8, x11, lsl #2]
; VBITS_GE_512-NEXT: st1w { z3.s }, p0, [x8]
; VBITS_GE_512-NEXT: ret
;
; VBITS_GE_1024-LABEL: load_v64f32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: mov x9, #32
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_1024-NEXT: st1w { z1.s }, p0, [x8]
; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: load_v64f32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_2048-NEXT: ret
%load = load <64 x float>, <64 x float>* %a
ret <64 x float> %load
}

File diff suppressed because it is too large

View File

@ -1,18 +1,7 @@
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_2048,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
@ -20,7 +9,7 @@ target triple = "aarch64-unknown-linux-gnu"
; LD1B
;
define void @masked_gather_v2i8(<2 x i8>* %a, <2 x i8*>* %b) #0 {
define void @masked_gather_v2i8(<2 x i8>* %a, <2 x i8*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v2i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x1]
@ -36,7 +25,7 @@ define void @masked_gather_v2i8(<2 x i8>* %a, <2 x i8*>* %b) #0 {
ret void
}
define void @masked_gather_v4i8(<4 x i8>* %a, <4 x i8*>* %b) #0 {
define void @masked_gather_v4i8(<4 x i8>* %a, <4 x i8*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v4i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@ -54,21 +43,21 @@ define void @masked_gather_v4i8(<4 x i8>* %a, <4 x i8*>* %b) #0 {
}
define void @masked_gather_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 {
; VBITS_EQ_256-LABEL: masked_gather_v8i8:
; VBITS_EQ_256: // %bb.0:
; VBITS_EQ_256-NEXT: mov x8, #4
; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_EQ_256-NEXT: ld1b { z0.d }, p0/z, [z0.d]
; VBITS_EQ_256-NEXT: ld1b { z1.d }, p0/z, [z1.d]
; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s
; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h
; VBITS_EQ_256-NEXT: uzp1 v0.8b, v1.8b, v0.8b
; VBITS_EQ_256-NEXT: str d0, [x0]
; VBITS_EQ_256-NEXT: ret
; VBITS_GE_256-LABEL: masked_gather_v8i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: ld1b { z0.d }, p0/z, [z0.d]
; VBITS_GE_256-NEXT: ld1b { z1.d }, p0/z, [z1.d]
; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT: uzp1 v0.8b, v1.8b, v0.8b
; VBITS_GE_256-NEXT: str d0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_gather_v8i8:
; VBITS_GE_512: // %bb.0:
@ -86,17 +75,17 @@ define void @masked_gather_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 {
ret void
}
define void @masked_gather_v16i8(<16 x i8>* %a, <16 x i8*>* %b) #0 {
; VBITS_GE_1024-LABEL: masked_gather_v16i8:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_1024-NEXT: ld1b { z0.d }, p0/z, [z0.d]
; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_1024-NEXT: uzp1 z0.b, z0.b, z0.b
; VBITS_GE_1024-NEXT: str q0, [x0]
; VBITS_GE_1024-NEXT: ret
define void @masked_gather_v16i8(<16 x i8>* %a, <16 x i8*>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: masked_gather_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1]
; CHECK-NEXT: ld1b { z0.d }, p0/z, [z0.d]
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: ret
%ptrs = load <16 x i8*>, <16 x i8*>* %b
%vals = call <16 x i8> @llvm.masked.gather.v16i8(<16 x i8*> %ptrs, i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
@ -104,18 +93,18 @@ define void @masked_gather_v16i8(<16 x i8>* %a, <16 x i8*>* %b) #0 {
ret void
}
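; The gather calls above assume a matching declaration, derived here from the
; call site (shown for v16i8; the other element counts follow the same
; pattern):
;   declare <16 x i8> @llvm.masked.gather.v16i8(<16 x i8*>, i32, <16 x i1>, <16 x i8>)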
define void @masked_gather_v32i8(<32 x i8>* %a, <32 x i8*>* %b) #0 {
; VBITS_GE_2048-LABEL: masked_gather_v32i8:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_2048-NEXT: ld1b { z0.d }, p0/z, [z0.d]
; VBITS_GE_2048-NEXT: ptrue p0.b, vl32
; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_2048-NEXT: uzp1 z0.b, z0.b, z0.b
; VBITS_GE_2048-NEXT: st1b { z0.b }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
define void @masked_gather_v32i8(<32 x i8>* %a, <32 x i8*>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_gather_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1]
; CHECK-NEXT: ld1b { z0.d }, p0/z, [z0.d]
; CHECK-NEXT: ptrue p0.b, vl32
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%ptrs = load <32 x i8*>, <32 x i8*>* %b
%vals = call <32 x i8> @llvm.masked.gather.v32i8(<32 x i8*> %ptrs, i32 8, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
@ -129,7 +118,7 @@ define void @masked_gather_v32i8(<32 x i8>* %a, <32 x i8*>* %b) #0 {
; LD1H
;
define void @masked_gather_v2i16(<2 x i16>* %a, <2 x i16*>* %b) #0 {
define void @masked_gather_v2i16(<2 x i16>* %a, <2 x i16*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x1]
@ -145,7 +134,7 @@ define void @masked_gather_v2i16(<2 x i16>* %a, <2 x i16*>* %b) #0 {
ret void
}
define void @masked_gather_v4i16(<4 x i16>* %a, <4 x i16*>* %b) #0 {
define void @masked_gather_v4i16(<4 x i16>* %a, <4 x i16*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -162,21 +151,21 @@ define void @masked_gather_v4i16(<4 x i16>* %a, <4 x i16*>* %b) #0 {
}
define void @masked_gather_v8i16(<8 x i16>* %a, <8 x i16*>* %b) #0 {
; VBITS_EQ_256-LABEL: masked_gather_v8i16:
; VBITS_EQ_256: // %bb.0:
; VBITS_EQ_256-NEXT: mov x8, #4
; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_EQ_256-NEXT: ld1h { z0.d }, p0/z, [z0.d]
; VBITS_EQ_256-NEXT: ld1h { z1.d }, p0/z, [z1.d]
; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s
; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h
; VBITS_EQ_256-NEXT: mov v1.d[1], v0.d[0]
; VBITS_EQ_256-NEXT: str q1, [x0]
; VBITS_EQ_256-NEXT: ret
; VBITS_GE_256-LABEL: masked_gather_v8i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: ld1h { z0.d }, p0/z, [z0.d]
; VBITS_GE_256-NEXT: ld1h { z1.d }, p0/z, [z1.d]
; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0]
; VBITS_GE_256-NEXT: str q1, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_gather_v8i16:
; VBITS_GE_512: // %bb.0:
@@ -193,17 +182,17 @@ define void @masked_gather_v8i16(<8 x i16>* %a, <8 x i16*>* %b) #0 {
ret void
}
define void @masked_gather_v16i16(<16 x i16>* %a, <16 x i16*>* %b) #0 {
; VBITS_GE_1024-LABEL: masked_gather_v16i16:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_1024-NEXT: ld1h { z0.d }, p0/z, [z0.d]
; VBITS_GE_1024-NEXT: ptrue p0.h, vl16
; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
define void @masked_gather_v16i16(<16 x i16>* %a, <16 x i16*>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: masked_gather_v16i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1]
; CHECK-NEXT: ld1h { z0.d }, p0/z, [z0.d]
; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%ptrs = load <16 x i16*>, <16 x i16*>* %b
%vals = call <16 x i16> @llvm.masked.gather.v16i16(<16 x i16*> %ptrs, i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i16> undef)
@@ -211,17 +200,17 @@ define void @masked_gather_v16i16(<16 x i16>* %a, <16 x i16*>* %b) #0 {
ret void
}
define void @masked_gather_v32i16(<32 x i16>* %a, <32 x i16*>* %b) #0 {
; VBITS_GE_2048-LABEL: masked_gather_v32i16:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_2048-NEXT: ld1h { z0.d }, p0/z, [z0.d]
; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
define void @masked_gather_v32i16(<32 x i16>* %a, <32 x i16*>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_gather_v32i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1]
; CHECK-NEXT: ld1h { z0.d }, p0/z, [z0.d]
; CHECK-NEXT: ptrue p0.h, vl32
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%ptrs = load <32 x i16*>, <32 x i16*>* %b
%vals = call <32 x i16> @llvm.masked.gather.v32i16(<32 x i16*> %ptrs, i32 8, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
@@ -235,7 +224,7 @@ define void @masked_gather_v32i16(<32 x i16>* %a, <32 x i16*>* %b) #0 {
; LD1W
;
define void @masked_gather_v2i32(<2 x i32>* %a, <2 x i32*>* %b) #0 {
define void @masked_gather_v2i32(<2 x i32>* %a, <2 x i32*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x1]
@@ -250,7 +239,7 @@ define void @masked_gather_v2i32(<2 x i32>* %a, <2 x i32*>* %b) #0 {
ret void
}
define void @masked_gather_v4i32(<4 x i32>* %a, <4 x i32*>* %b) #0 {
define void @masked_gather_v4i32(<4 x i32>* %a, <4 x i32*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -266,21 +255,21 @@ define void @masked_gather_v4i32(<4 x i32>* %a, <4 x i32*>* %b) #0 {
}
define void @masked_gather_v8i32(<8 x i32>* %a, <8 x i32*>* %b) #0 {
; VBITS_EQ_256-LABEL: masked_gather_v8i32:
; VBITS_EQ_256: // %bb.0:
; VBITS_EQ_256-NEXT: mov x8, #4
; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_EQ_256-NEXT: ld1w { z0.d }, p0/z, [z0.d]
; VBITS_EQ_256-NEXT: ld1w { z1.d }, p0/z, [z1.d]
; VBITS_EQ_256-NEXT: ptrue p0.s, vl4
; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s
; VBITS_EQ_256-NEXT: splice z1.s, p0, z1.s, z0.s
; VBITS_EQ_256-NEXT: ptrue p0.s, vl8
; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_EQ_256-NEXT: ret
; VBITS_GE_256-LABEL: masked_gather_v8i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: ld1w { z0.d }, p0/z, [z0.d]
; VBITS_GE_256-NEXT: ld1w { z1.d }, p0/z, [z1.d]
; VBITS_GE_256-NEXT: ptrue p0.s, vl4
; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_gather_v8i32:
; VBITS_GE_512: // %bb.0:
@@ -297,16 +286,16 @@ define void @masked_gather_v8i32(<8 x i32>* %a, <8 x i32*>* %b) #0 {
ret void
}
define void @masked_gather_v16i32(<16 x i32>* %a, <16 x i32*>* %b) #0 {
; VBITS_GE_1024-LABEL: masked_gather_v16i32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_1024-NEXT: ld1w { z0.d }, p0/z, [z0.d]
; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
define void @masked_gather_v16i32(<16 x i32>* %a, <16 x i32*>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: masked_gather_v16i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1]
; CHECK-NEXT: ld1w { z0.d }, p0/z, [z0.d]
; CHECK-NEXT: ptrue p0.s, vl16
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%ptrs = load <16 x i32*>, <16 x i32*>* %b
%vals = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptrs, i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
@@ -314,16 +303,16 @@ define void @masked_gather_v16i32(<16 x i32>* %a, <16 x i32*>* %b) #0 {
ret void
}
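; Note the pattern above: the gather loads each 32-bit value into a 64-bit
; lane (matching the pointer vector's element count), and the trailing uzp1
; packs the lanes back down to 32 bits before the store.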
define void @masked_gather_v32i32(<32 x i32>* %a, <32 x i32*>* %b) #0 {
; VBITS_GE_2048-LABEL: masked_gather_v32i32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_2048-NEXT: ld1w { z0.d }, p0/z, [z0.d]
; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
define void @masked_gather_v32i32(<32 x i32>* %a, <32 x i32*>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_gather_v32i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1]
; CHECK-NEXT: ld1w { z0.d }, p0/z, [z0.d]
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%ptrs = load <32 x i32*>, <32 x i32*>* %b
%vals = call <32 x i32> @llvm.masked.gather.v32i32(<32 x i32*> %ptrs, i32 8, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
@@ -337,7 +326,7 @@ define void @masked_gather_v32i32(<32 x i32>* %a, <32 x i32*>* %b) #0 {
; LD1D
;
define void @masked_gather_v2i64(<2 x i64>* %a, <2 x i64*>* %b) #0 {
define void @masked_gather_v2i64(<2 x i64>* %a, <2 x i64*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x1]
@@ -351,7 +340,7 @@ define void @masked_gather_v2i64(<2 x i64>* %a, <2 x i64*>* %b) #0 {
ret void
}
define void @masked_gather_v4i64(<4 x i64>* %a, <4 x i64*>* %b) #0 {
define void @masked_gather_v4i64(<4 x i64>* %a, <4 x i64*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -366,17 +355,17 @@ define void @masked_gather_v4i64(<4 x i64>* %a, <4 x i64*>* %b) #0 {
}
define void @masked_gather_v8i64(<8 x i64>* %a, <8 x i64*>* %b) #0 {
; VBITS_EQ_256-LABEL: masked_gather_v8i64:
; VBITS_EQ_256: // %bb.0:
; VBITS_EQ_256-NEXT: mov x8, #4
; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [z0.d]
; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [z1.d]
; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_EQ_256-NEXT: ret
; VBITS_GE_256-LABEL: masked_gather_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [z0.d]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [z1.d]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_gather_v8i64:
; VBITS_GE_512: // %bb.0:
@@ -391,14 +380,14 @@ define void @masked_gather_v8i64(<8 x i64>* %a, <8 x i64*>* %b) #0 {
ret void
}
define void @masked_gather_v16i64(<16 x i64>* %a, <16 x i64*>* %b) #0 {
; VBITS_GE_1024-LABEL: masked_gather_v16i64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [z0.d]
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
define void @masked_gather_v16i64(<16 x i64>* %a, <16 x i64*>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: masked_gather_v16i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1]
; CHECK-NEXT: ld1d { z0.d }, p0/z, [z0.d]
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%ptrs = load <16 x i64*>, <16 x i64*>* %b
%vals = call <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i64> undef)
@@ -406,14 +395,14 @@ define void @masked_gather_v16i64(<16 x i64>* %a, <16 x i64*>* %b) #0 {
ret void
}
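; For 64-bit elements the gathered lanes are already at their destination
; width, so no uzp1 packing step is needed between the gather and the store.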
define void @masked_gather_v32i64(<32 x i64>* %a, <32 x i64*>* %b) #0 {
; VBITS_GE_2048-LABEL: masked_gather_v32i64:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [z0.d]
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
define void @masked_gather_v32i64(<32 x i64>* %a, <32 x i64*>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_gather_v32i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1]
; CHECK-NEXT: ld1d { z0.d }, p0/z, [z0.d]
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%ptrs = load <32 x i64*>, <32 x i64*>* %b
%vals = call <32 x i64> @llvm.masked.gather.v32i64(<32 x i64*> %ptrs, i32 8, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -1,28 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; Don't use SVE when its registers are no bigger than NEON.
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
;;
;; Masked Stores
;;
define void @masked_store_v2f16(<2 x half>* %ap, <2 x half>* %bp) #0 {
;
; Masked Stores
;
define void @masked_store_v2f16(<2 x half>* %ap, <2 x half>* %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_store_v2f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s1, [x0]
@@ -52,8 +39,7 @@ define void @masked_store_v2f16(<2 x half>* %ap, <2 x half>* %bp) #0 {
ret void
}
define void @masked_store_v2f32(<2 x float>* %ap, <2 x float>* %bp) #0 {
define void @masked_store_v2f32(<2 x float>* %ap, <2 x float>* %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_store_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
@@ -70,7 +56,7 @@ define void @masked_store_v2f32(<2 x float>* %ap, <2 x float>* %bp) #0 {
ret void
}
define void @masked_store_v4f32(<4 x float>* %ap, <4 x float>* %bp) #0 {
define void @masked_store_v4f32(<4 x float>* %ap, <4 x float>* %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_store_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
@@ -87,7 +73,7 @@ define void @masked_store_v4f32(<4 x float>* %ap, <4 x float>* %bp) #0 {
ret void
}
define void @masked_store_v8f32(<8 x float>* %ap, <8 x float>* %bp) #0 {
define void @masked_store_v8f32(<8 x float>* %ap, <8 x float>* %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_store_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -133,39 +119,15 @@ define void @masked_store_v16f32(<16 x float>* %ap, <16 x float>* %bp) #0 {
ret void
}
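; Every body in this file follows the same shape: load both operands, build
; the mask with fcmp, then store under that mask. A hedged sketch of the
; elided final step (the exact intrinsic mangling in the file is assumed):
;   call void @llvm.masked.store.v32f32(<32 x float> %a, <32 x float>* %ap, i32 8, <32 x i1> %mask)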
define void @masked_store_v32f32(<32 x float>* %ap, <32 x float>* %bp) #0 {
; VBITS_GE_256-LABEL: masked_store_v32f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #24
; VBITS_GE_256-NEXT: mov x9, #16
; VBITS_GE_256-NEXT: mov x10, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x1, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z2.s, z5.s
; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z1.s, z4.s
; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z0.s, z6.s
; VBITS_GE_256-NEXT: fcmeq p0.s, p0/z, z3.s, z7.s
; VBITS_GE_256-NEXT: st1w { z0.s }, p3, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p2, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z2.s }, p1, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: masked_store_v32f32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_1024-NEXT: fcmeq p0.s, p0/z, z0.s, z1.s
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
define void @masked_store_v32f32(<32 x float>* %ap, <32 x float>* %bp) vscale_range(8,0) #0 {
; CHECK-LABEL: masked_store_v32f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%a = load <32 x float>, <32 x float>* %ap
%b = load <32 x float>, <32 x float>* %bp
%mask = fcmp oeq <32 x float> %a, %b
@@ -173,59 +135,15 @@ define void @masked_store_v32f32(<32 x float>* %ap, <32 x float>* %bp) #0 {
ret void
}
define void @masked_store_v64f32(<64 x float>* %ap, <64 x float>* %bp) #0 {
; VBITS_GE_256-LABEL: masked_store_v64f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #56
; VBITS_GE_256-NEXT: mov x9, #48
; VBITS_GE_256-NEXT: mov x10, #40
; VBITS_GE_256-NEXT: mov x11, #32
; VBITS_GE_256-NEXT: mov x12, #24
; VBITS_GE_256-NEXT: mov x13, #16
; VBITS_GE_256-NEXT: mov x14, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x12, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x13, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x14, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1w { z16.s }, p0/z, [x1, x13, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z17.s }, p0/z, [x1, x14, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z18.s }, p0/z, [x1, x11, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z19.s }, p0/z, [x1, x12, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z20.s }, p0/z, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z21.s }, p0/z, [x1, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z22.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z23.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z6.s, z17.s
; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z5.s, z16.s
; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z4.s, z19.s
; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z3.s, z18.s
; VBITS_GE_256-NEXT: fcmeq p5.s, p0/z, z2.s, z21.s
; VBITS_GE_256-NEXT: fcmeq p6.s, p0/z, z1.s, z20.s
; VBITS_GE_256-NEXT: fcmeq p7.s, p0/z, z0.s, z22.s
; VBITS_GE_256-NEXT: fcmeq p0.s, p0/z, z7.s, z23.s
; VBITS_GE_256-NEXT: st1w { z0.s }, p7, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p6, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z2.s }, p5, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z3.s }, p4, [x0, x11, lsl #2]
; VBITS_GE_256-NEXT: st1w { z4.s }, p3, [x0, x12, lsl #2]
; VBITS_GE_256-NEXT: st1w { z5.s }, p2, [x0, x13, lsl #2]
; VBITS_GE_256-NEXT: st1w { z6.s }, p1, [x0, x14, lsl #2]
; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: masked_store_v64f32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_2048-NEXT: fcmeq p0.s, p0/z, z0.s, z1.s
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
define void @masked_store_v64f32(<64 x float>* %ap, <64 x float>* %bp) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_store_v64f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%a = load <64 x float>, <64 x float>* %ap
%b = load <64 x float>, <64 x float>* %bp
%mask = fcmp oeq <64 x float> %a, %b
@@ -266,7 +184,6 @@ define void @masked_store_trunc_v8i64i8(<8 x i64>* %ap, <8 x i64>* %bp, <8 x i8>
; VBITS_GE_512-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d
; VBITS_GE_512-NEXT: st1b { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT: ret
%a = load <8 x i64>, <8 x i64>* %ap
%b = load <8 x i64>, <8 x i64>* %bp
%mask = icmp eq <8 x i64> %a, %b


@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
@@ -163,27 +163,27 @@ define void @test_revwv8i32v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
; REVH pattern for shuffle v32i16 with 256 bits and 512 bits SVE.
define void @test_revhv32i16(<32 x i16>* %a) #0 {
; VBITS_EQ_256-LABEL: test_revhv32i16:
; VBITS_EQ_256: // %bb.0:
; VBITS_EQ_256-NEXT: mov x8, #16
; VBITS_EQ_256-NEXT: ptrue p0.h, vl16
; VBITS_EQ_256-NEXT: ptrue p1.d
; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_EQ_256-NEXT: revh z0.d, p1/m, z0.d
; VBITS_EQ_256-NEXT: revh z1.d, p1/m, z1.d
; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0]
; VBITS_EQ_256-NEXT: ret
;
; VBITS_GE_256-LABEL: test_revhv32i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl32
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ptrue p1.d
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: revh z0.d, p1/m, z0.d
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_256-NEXT: revh z1.d, p1/m, z1.d
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: test_revhv32i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ptrue p1.d
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: revh z0.d, p1/m, z0.d
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%tmp1 = load <32 x i16>, <32 x i16>* %a
%tmp2 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20, i32 27, i32 undef, i32 undef, i32 undef, i32 31, i32 30, i32 29, i32 undef>
store <32 x i16> %tmp2, <32 x i16>* %a
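; The shuffle mask reverses each group of four i16 elements, i.e. it swaps
; the halfwords within every 64-bit doubleword, which is exactly the
; operation revh z.d performs under an all-true predicate.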


@@ -1,54 +1,46 @@
; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: ptrue
;
; RBIT
;
define <8 x i8> @bitreverse_v8i8(<8 x i8> %op) #0 {
define <8 x i8> @bitreverse_v8i8(<8 x i8> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v8i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl8
; CHECK-NEXT: rbit z0.b, [[PG]]/m, z0.b
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: ptrue p0.b, vl8
; CHECK-NEXT: rbit z0.b, p0/m, z0.b
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%res = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %op)
ret <8 x i8> %res
}
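; The kill comments above reflect register aliasing: d0 is the low 64 bits
; of z0, so the value is re-declared as a z register for the SVE rbit and
; narrowed back to d0 for the NEON return.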
define <16 x i8> @bitreverse_v16i8(<16 x i8> %op) #0 {
define <16 x i8> @bitreverse_v16i8(<16 x i8> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v16i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl16
; CHECK-NEXT: rbit z0.b, [[PG]]/m, z0.b
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: ptrue p0.b, vl16
; CHECK-NEXT: rbit z0.b, p0/m, z0.b
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%res = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %op)
ret <16 x i8> %res
}
define void @bitreverse_v32i8(<32 x i8>* %a) #0 {
define void @bitreverse_v32i8(<32 x i8>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v32i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-NEXT: rbit [[RES:z[0-9]+]].b, [[PG]]/m, [[OP]].b
; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl32
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: rbit z0.b, p0/m, z0.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op = load <32 x i8>, <32 x i8>* %a
%res = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %op)
store <32 x i8> %res, <32 x i8>* %a
@@ -56,80 +48,91 @@ define void @bitreverse_v32i8(<32 x i8>* %a) #0 {
}
define void @bitreverse_v64i8(<64 x i8>* %a) #0 {
; CHECK-LABEL: bitreverse_v64i8:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: rbit [[RES:z[0-9]+]].b, [[PG]]/m, [[OP]].b
; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret
; VBITS_GE_256-LABEL: bitreverse_v64i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov w8, #32
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: rbit z0.b, p0/m, z0.b
; VBITS_GE_256-NEXT: rbit z1.b, p0/m, z1.b
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
; VBITS_EQ_256-DAG: ld1b { [[OP_LO:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1b { [[OP_HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
; VBITS_EQ_256-DAG: rbit [[RES_LO:z[0-9]+]].b, [[PG]]/m, [[OP_LO]].b
; VBITS_EQ_256-DAG: rbit [[RES_HI:z[0-9]+]].b, [[PG]]/m, [[OP_HI]].b
; VBITS_EQ_256-DAG: st1b { [[RES_LO]].b }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1b { [[RES_HI]].b }, [[PG]], [x0, x[[NUMELTS]]]
; VBITS_EQ_256-NEXT: ret
; VBITS_GE_512-LABEL: bitreverse_v64i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.b, vl64
; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT: rbit z0.b, p0/m, z0.b
; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op = load <64 x i8>, <64 x i8>* %a
%res = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %op)
store <64 x i8> %res, <64 x i8>* %a
ret void
}
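; With 256-bit SVE the 64-byte vector is legalised as two vl32 halves (hence
; the paired loads, rbits and stores above), while 512-bit SVE covers it with
; a single vl64 predicate.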
define void @bitreverse_v128i8(<128 x i8>* %a) #0 {
define void @bitreverse_v128i8(<128 x i8>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bitreverse_v128i8:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: rbit [[RES:z[0-9]+]].b, [[PG]]/m, [[OP]].b
; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl128
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: rbit z0.b, p0/m, z0.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op = load <128 x i8>, <128 x i8>* %a
%res = call <128 x i8> @llvm.bitreverse.v128i8(<128 x i8> %op)
store <128 x i8> %res, <128 x i8>* %a
ret void
}
define void @bitreverse_v256i8(<256 x i8>* %a) #0 {
define void @bitreverse_v256i8(<256 x i8>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bitreverse_v256i8:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: rbit [[RES:z[0-9]+]].b, [[PG]]/m, [[OP]].b
; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl256
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: rbit z0.b, p0/m, z0.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op = load <256 x i8>, <256 x i8>* %a
%res = call <256 x i8> @llvm.bitreverse.v256i8(<256 x i8> %op)
store <256 x i8> %res, <256 x i8>* %a
ret void
}
define <4 x i16> @bitreverse_v4i16(<4 x i16> %op) #0 {
define <4 x i16> @bitreverse_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v4i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl4
; CHECK-NEXT: rbit z0.h, [[PG]]/m, z0.h
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: ptrue p0.h, vl4
; CHECK-NEXT: rbit z0.h, p0/m, z0.h
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%res = call <4 x i16> @llvm.bitreverse.v4i16(<4 x i16> %op)
ret <4 x i16> %res
}
define <8 x i16> @bitreverse_v8i16(<8 x i16> %op) #0 {
define <8 x i16> @bitreverse_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v8i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl8
; CHECK-NEXT: rbit z0.h, [[PG]]/m, z0.h
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: ptrue p0.h, vl8
; CHECK-NEXT: rbit z0.h, p0/m, z0.h
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%res = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %op)
ret <8 x i16> %res
}
define void @bitreverse_v16i16(<16 x i16>* %a) #0 {
define void @bitreverse_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v16i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: rbit [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: rbit z0.h, p0/m, z0.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op = load <16 x i16>, <16 x i16>* %a
%res = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %op)
store <16 x i16> %res, <16 x i16>* %a
@@ -137,80 +140,91 @@ define void @bitreverse_v16i16(<16 x i16>* %a) #0 {
}
define void @bitreverse_v32i16(<32 x i16>* %a) #0 {
; CHECK-LABEL: bitreverse_v32i16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: rbit [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret
; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[OP_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[OP_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: rbit [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP_LO]].h
; VBITS_EQ_256-DAG: rbit [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h
; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-NEXT: ret
; VBITS_GE_256-LABEL: bitreverse_v32i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: rbit z0.h, p0/m, z0.h
; VBITS_GE_256-NEXT: rbit z1.h, p0/m, z1.h
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: bitreverse_v32i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: rbit z0.h, p0/m, z0.h
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op = load <32 x i16>, <32 x i16>* %a
%res = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %op)
store <32 x i16> %res, <32 x i16>* %a
ret void
}
define void @bitreverse_v64i16(<64 x i16>* %a) #0 {
define void @bitreverse_v64i16(<64 x i16>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bitreverse_v64i16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: rbit [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: rbit z0.h, p0/m, z0.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op = load <64 x i16>, <64 x i16>* %a
%res = call <64 x i16> @llvm.bitreverse.v64i16(<64 x i16> %op)
store <64 x i16> %res, <64 x i16>* %a
ret void
}
define void @bitreverse_v128i16(<128 x i16>* %a) #0 {
define void @bitreverse_v128i16(<128 x i16>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bitreverse_v128i16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: rbit [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: rbit z0.h, p0/m, z0.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op = load <128 x i16>, <128 x i16>* %a
%res = call <128 x i16> @llvm.bitreverse.v128i16(<128 x i16> %op)
store <128 x i16> %res, <128 x i16>* %a
ret void
}
define <2 x i32> @bitreverse_v2i32(<2 x i32> %op) #0 {
define <2 x i32> @bitreverse_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v2i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
; CHECK-NEXT: rbit z0.s, [[PG]]/m, z0.s
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: ptrue p0.s, vl2
; CHECK-NEXT: rbit z0.s, p0/m, z0.s
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%res = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %op)
ret <2 x i32> %res
}
define <4 x i32> @bitreverse_v4i32(<4 x i32> %op) #0 {
define <4 x i32> @bitreverse_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v4i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
; CHECK-NEXT: rbit z0.s, [[PG]]/m, z0.s
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: rbit z0.s, p0/m, z0.s
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%res = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %op)
ret <4 x i32> %res
}
define void @bitreverse_v8i32(<8 x i32>* %a) #0 {
define void @bitreverse_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v8i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: rbit [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: rbit z0.s, p0/m, z0.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op = load <8 x i32>, <8 x i32>* %a
%res = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %op)
store <8 x i32> %res, <8 x i32>* %a
@@ -218,80 +232,91 @@ define void @bitreverse_v8i32(<8 x i32>* %a) #0 {
}
define void @bitreverse_v16i32(<16 x i32>* %a) #0 {
; CHECK-LABEL: bitreverse_v16i32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: rbit [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret
; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[OP_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[OP_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: rbit [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s
; VBITS_EQ_256-DAG: rbit [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s
; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-NEXT: ret
; VBITS_GE_256-LABEL: bitreverse_v16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: rbit z0.s, p0/m, z0.s
; VBITS_GE_256-NEXT: rbit z1.s, p0/m, z1.s
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: bitreverse_v16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: rbit z0.s, p0/m, z0.s
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op = load <16 x i32>, <16 x i32>* %a
%res = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %op)
store <16 x i32> %res, <16 x i32>* %a
ret void
}
define void @bitreverse_v32i32(<32 x i32>* %a) #0 {
define void @bitreverse_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bitreverse_v32i32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: rbit [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: rbit z0.s, p0/m, z0.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op = load <32 x i32>, <32 x i32>* %a
%res = call <32 x i32> @llvm.bitreverse.v32i32(<32 x i32> %op)
store <32 x i32> %res, <32 x i32>* %a
ret void
}
define void @bitreverse_v64i32(<64 x i32>* %a) #0 {
define void @bitreverse_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bitreverse_v64i32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: rbit [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: rbit z0.s, p0/m, z0.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op = load <64 x i32>, <64 x i32>* %a
%res = call <64 x i32> @llvm.bitreverse.v64i32(<64 x i32> %op)
store <64 x i32> %res, <64 x i32>* %a
ret void
}
define <1 x i64> @bitreverse_v1i64(<1 x i64> %op) #0 {
define <1 x i64> @bitreverse_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v1i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
; CHECK-NEXT: rbit z0.d, [[PG]]/m, z0.d
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: ptrue p0.d, vl1
; CHECK-NEXT: rbit z0.d, p0/m, z0.d
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%res = call <1 x i64> @llvm.bitreverse.v1i64(<1 x i64> %op)
ret <1 x i64> %res
}
define <2 x i64> @bitreverse_v2i64(<2 x i64> %op) #0 {
define <2 x i64> @bitreverse_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v2i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
; CHECK-NEXT: rbit z0.d, [[PG]]/m, z0.d
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: rbit z0.d, p0/m, z0.d
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%res = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %op)
ret <2 x i64> %res
}
define void @bitreverse_v4i64(<4 x i64>* %a) #0 {
define void @bitreverse_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: rbit [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: rbit z0.d, p0/m, z0.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op = load <4 x i64>, <4 x i64>* %a
%res = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %op)
store <4 x i64> %res, <4 x i64>* %a
@@ -299,49 +324,53 @@ define void @bitreverse_v4i64(<4 x i64>* %a) #0 {
}
define void @bitreverse_v8i64(<8 x i64>* %a) #0 {
; CHECK-LABEL: bitreverse_v8i64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: rbit [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret
; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[OP_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[OP_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: rbit [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d
; VBITS_EQ_256-DAG: rbit [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d
; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-NEXT: ret
; VBITS_GE_256-LABEL: bitreverse_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: rbit z0.d, p0/m, z0.d
; VBITS_GE_256-NEXT: rbit z1.d, p0/m, z1.d
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: bitreverse_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: rbit z0.d, p0/m, z0.d
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op = load <8 x i64>, <8 x i64>* %a
%res = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %op)
store <8 x i64> %res, <8 x i64>* %a
ret void
}
define void @bitreverse_v16i64(<16 x i64>* %a) #0 {
define void @bitreverse_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bitreverse_v16i64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: rbit [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: rbit z0.d, p0/m, z0.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op = load <16 x i64>, <16 x i64>* %a
%res = call <16 x i64> @llvm.bitreverse.v16i64(<16 x i64> %op)
store <16 x i64> %res, <16 x i64>* %a
ret void
}
define void @bitreverse_v32i64(<32 x i64>* %a) #0 {
define void @bitreverse_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bitreverse_v32i64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: rbit [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: rbit z0.d, p0/m, z0.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op = load <32 x i64>, <32 x i64>* %a
%res = call <32 x i64> @llvm.bitreverse.v32i64(<32 x i64> %op)
store <32 x i64> %res, <32 x i64>* %a
@@ -353,30 +382,33 @@ define void @bitreverse_v32i64(<32 x i64>* %a) #0 {
;
; Don't use SVE for 64-bit vectors.
define <4 x i16> @bswap_v4i16(<4 x i16> %op) #0 {
define <4 x i16> @bswap_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v4i16:
; CHECK: rev16 v0.8b, v0.8b
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: rev16 v0.8b, v0.8b
; CHECK-NEXT: ret
%res = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %op)
ret <4 x i16> %res
}
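; rev16 reverses the bytes within each 16-bit group, which is exactly a byte
; swap for i16 elements, so plain NEON is sufficient for these small vectors.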
; Don't use SVE for 128-bit vectors.
define <8 x i16> @bswap_v8i16(<8 x i16> %op) #0 {
define <8 x i16> @bswap_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v8i16:
; CHECK: rev16 v0.16b, v0.16b
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: rev16 v0.16b, v0.16b
; CHECK-NEXT: ret
%res = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %op)
ret <8 x i16> %res
}
define void @bswap_v16i16(<16 x i16>* %a) #0 {
define void @bswap_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v16i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: revb [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: revb z0.h, p0/m, z0.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op = load <16 x i16>, <16 x i16>* %a
%res = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %op)
store <16 x i16> %res, <16 x i16>* %a
@@ -384,49 +416,53 @@ define void @bswap_v16i16(<16 x i16>* %a) #0 {
}
define void @bswap_v32i16(<32 x i16>* %a) #0 {
; CHECK-LABEL: bswap_v32i16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: revb [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret
; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[OP_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[OP_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: revb [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP_LO]].h
; VBITS_EQ_256-DAG: revb [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h
; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-NEXT: ret
; VBITS_GE_256-LABEL: bswap_v32i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: revb z0.h, p0/m, z0.h
; VBITS_GE_256-NEXT: revb z1.h, p0/m, z1.h
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: bswap_v32i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: revb z0.h, p0/m, z0.h
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op = load <32 x i16>, <32 x i16>* %a
%res = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %op)
store <32 x i16> %res, <32 x i16>* %a
ret void
}
define void @bswap_v64i16(<64 x i16>* %a) #0 {
define void @bswap_v64i16(<64 x i16>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bswap_v64i16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: revb [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: revb z0.h, p0/m, z0.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op = load <64 x i16>, <64 x i16>* %a
%res = call <64 x i16> @llvm.bswap.v64i16(<64 x i16> %op)
store <64 x i16> %res, <64 x i16>* %a
ret void
}
define void @bswap_v128i16(<128 x i16>* %a) #0 {
define void @bswap_v128i16(<128 x i16>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bswap_v128i16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: revb [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: revb z0.h, p0/m, z0.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op = load <128 x i16>, <128 x i16>* %a
%res = call <128 x i16> @llvm.bswap.v128i16(<128 x i16> %op)
store <128 x i16> %res, <128 x i16>* %a
@@ -434,30 +470,33 @@ define void @bswap_v128i16(<128 x i16>* %a) #0 {
}
; Don't use SVE for 64-bit vectors.
define <2 x i32> @bswap_v2i32(<2 x i32> %op) #0 {
define <2 x i32> @bswap_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v2i32:
; CHECK: rev32 v0.8b, v0.8b
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: rev32 v0.8b, v0.8b
; CHECK-NEXT: ret
%res = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %op)
ret <2 x i32> %res
}
; Don't use SVE for 128-bit vectors.
define <4 x i32> @bswap_v4i32(<4 x i32> %op) #0 {
define <4 x i32> @bswap_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v4i32:
; CHECK: rev32 v0.16b, v0.16b
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: rev32 v0.16b, v0.16b
; CHECK-NEXT: ret
%res = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %op)
ret <4 x i32> %res
}
define void @bswap_v8i32(<8 x i32>* %a) #0 {
define void @bswap_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v8i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: revb [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: revb z0.s, p0/m, z0.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op = load <8 x i32>, <8 x i32>* %a
%res = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %op)
store <8 x i32> %res, <8 x i32>* %a
@@ -465,49 +504,53 @@ define void @bswap_v8i32(<8 x i32>* %a) #0 {
}
define void @bswap_v16i32(<16 x i32>* %a) #0 {
; CHECK-LABEL: bswap_v16i32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: revb [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret
; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[OP_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[OP_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: revb [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s
; VBITS_EQ_256-DAG: revb [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s
; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-NEXT: ret
; VBITS_GE_256-LABEL: bswap_v16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: revb z0.s, p0/m, z0.s
; VBITS_GE_256-NEXT: revb z1.s, p0/m, z1.s
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: bswap_v16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: revb z0.s, p0/m, z0.s
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op = load <16 x i32>, <16 x i32>* %a
%res = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %op)
store <16 x i32> %res, <16 x i32>* %a
ret void
}
define void @bswap_v32i32(<32 x i32>* %a) #0 {
define void @bswap_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bswap_v32i32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: revb [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: revb z0.s, p0/m, z0.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op = load <32 x i32>, <32 x i32>* %a
%res = call <32 x i32> @llvm.bswap.v32i32(<32 x i32> %op)
store <32 x i32> %res, <32 x i32>* %a
ret void
}
define void @bswap_v64i32(<64 x i32>* %a) #0 {
define void @bswap_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bswap_v64i32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: revb [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: revb z0.s, p0/m, z0.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op = load <64 x i32>, <64 x i32>* %a
%res = call <64 x i32> @llvm.bswap.v64i32(<64 x i32> %op)
store <64 x i32> %res, <64 x i32>* %a
@ -515,30 +558,33 @@ define void @bswap_v64i32(<64 x i32>* %a) #0 {
}
; Don't use SVE for 64-bit vectors.
define <1 x i64> @bswap_v1i64(<1 x i64> %op) #0 {
define <1 x i64> @bswap_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v1i64:
; CHECK: rev64 v0.8b, v0.8b
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: rev64 v0.8b, v0.8b
; CHECK-NEXT: ret
%res = call <1 x i64> @llvm.bswap.v1i64(<1 x i64> %op)
ret <1 x i64> %res
}
; Don't use SVE for 128-bit vectors.
define <2 x i64> @bswap_v2i64(<2 x i64> %op) #0 {
define <2 x i64> @bswap_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v2i64:
; CHECK: rev64 v0.16b, v0.16b
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: rev64 v0.16b, v0.16b
; CHECK-NEXT: ret
%res = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %op)
ret <2 x i64> %res
}
define void @bswap_v4i64(<4 x i64>* %a) #0 {
define void @bswap_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: revb [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: revb z0.d, p0/m, z0.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op = load <4 x i64>, <4 x i64>* %a
%res = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %op)
store <4 x i64> %res, <4 x i64>* %a
@ -546,49 +592,53 @@ define void @bswap_v4i64(<4 x i64>* %a) #0 {
}
define void @bswap_v8i64(<8 x i64>* %a) #0 {
; CHECK-LABEL: bswap_v8i64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: revb [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret
; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[OP_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[OP_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: revb [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d
; VBITS_EQ_256-DAG: revb [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d
; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-NEXT: ret
; VBITS_GE_256-LABEL: bswap_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: revb z0.d, p0/m, z0.d
; VBITS_GE_256-NEXT: revb z1.d, p0/m, z1.d
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: bswap_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: revb z0.d, p0/m, z0.d
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op = load <8 x i64>, <8 x i64>* %a
%res = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %op)
store <8 x i64> %res, <8 x i64>* %a
ret void
}
define void @bswap_v16i64(<16 x i64>* %a) #0 {
define void @bswap_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bswap_v16i64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: revb [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: revb z0.d, p0/m, z0.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op = load <16 x i64>, <16 x i64>* %a
%res = call <16 x i64> @llvm.bswap.v16i64(<16 x i64> %op)
store <16 x i64> %res, <16 x i64>* %a
ret void
}
define void @bswap_v32i64(<32 x i64>* %a) #0 {
define void @bswap_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bswap_v32i64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: revb [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: revb z0.d, p0/m, z0.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op = load <32 x i64>, <32 x i64>* %a
%res = call <32 x i64> @llvm.bswap.v32i64(<32 x i64> %op)
store <32 x i64> %res, <32 x i64>* %a
@ -640,4 +690,3 @@ declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>)
declare <8 x i64> @llvm.bswap.v8i64(<8 x i64>)
declare <16 x i64> @llvm.bswap.v16i64(<16 x i64>)
declare <32 x i64> @llvm.bswap.v32i64(<32 x i64>)
@ -1,23 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
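; A minimal sketch of how the autogenerated assertions are refreshed after
; editing the RUN lines (both paths below are illustrative, not taken from
; this change):
;   llvm/utils/update_llc_test_checks.py --llc-binary=<build>/bin/llc \
;       llvm/test/CodeGen/AArch64/<this-test>.ll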
target triple = "aarch64-unknown-linux-gnu"
define <8 x i8> @sdiv_v8i8(<8 x i8> %op1) #0 {
define <8 x i8> @sdiv_v8i8(<8 x i8> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@ -29,7 +17,7 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1) #0 {
ret <8 x i8> %res
}
define <16 x i8> @sdiv_v16i8(<16 x i8> %op1) #0 {
define <16 x i8> @sdiv_v16i8(<16 x i8> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@ -41,7 +29,7 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1) #0 {
ret <16 x i8> %res
}
define void @sdiv_v32i8(<32 x i8>* %a) #0 {
define void @sdiv_v32i8(<32 x i8>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl32
@ -81,91 +69,35 @@ define void @sdiv_v64i8(<64 x i8>* %a) #0 {
ret void
}
define void @sdiv_v128i8(<128 x i8>* %a) #0 {
; VBITS_GE_256-LABEL: sdiv_v128i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov w8, #96
; VBITS_GE_256-NEXT: mov w9, #32
; VBITS_GE_256-NEXT: mov w10, #64
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x9]
; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0, x10]
; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: asrd z1.b, p0/m, z1.b, #5
; VBITS_GE_256-NEXT: asrd z0.b, p0/m, z0.b, #5
; VBITS_GE_256-NEXT: asrd z2.b, p0/m, z2.b, #5
; VBITS_GE_256-NEXT: asrd z3.b, p0/m, z3.b, #5
; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0, x10]
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0, x9]
; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: sdiv_v128i8:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.b, vl128
; VBITS_GE_1024-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_1024-NEXT: asrd z0.b, p0/m, z0.b, #5
; VBITS_GE_1024-NEXT: st1b { z0.b }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
define void @sdiv_v128i8(<128 x i8>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: sdiv_v128i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl128
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: asrd z0.b, p0/m, z0.b, #5
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <128 x i8>, <128 x i8>* %a
%res = sdiv <128 x i8> %op1, shufflevector (<128 x i8> insertelement (<128 x i8> poison, i8 32, i32 0), <128 x i8> poison, <128 x i32> zeroinitializer)
store <128 x i8> %res, <128 x i8>* %a
ret void
}
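; Because the divisor is a power of two (32 == 2^5), these sdivs lower to
; ASRD with immediate #5; ASRD is an arithmetic shift right that rounds
; towards zero, matching signed division semantics.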
define void @sdiv_v256i8(<256 x i8>* %a) #0 {
; VBITS_GE_256-LABEL: sdiv_v256i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov w8, #192
; VBITS_GE_256-NEXT: mov w9, #96
; VBITS_GE_256-NEXT: mov w10, #32
; VBITS_GE_256-NEXT: mov w11, #160
; VBITS_GE_256-NEXT: mov w12, #64
; VBITS_GE_256-NEXT: mov w13, #224
; VBITS_GE_256-NEXT: mov w14, #128
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x9]
; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x10]
; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0, x11]
; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x0, x12]
; VBITS_GE_256-NEXT: ld1b { z4.b }, p0/z, [x0, x13]
; VBITS_GE_256-NEXT: ld1b { z5.b }, p0/z, [x0, x14]
; VBITS_GE_256-NEXT: ld1b { z6.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT: ld1b { z7.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: asrd z1.b, p0/m, z1.b, #5
; VBITS_GE_256-NEXT: asrd z0.b, p0/m, z0.b, #5
; VBITS_GE_256-NEXT: asrd z3.b, p0/m, z3.b, #5
; VBITS_GE_256-NEXT: asrd z2.b, p0/m, z2.b, #5
; VBITS_GE_256-NEXT: asrd z5.b, p0/m, z5.b, #5
; VBITS_GE_256-NEXT: asrd z4.b, p0/m, z4.b, #5
; VBITS_GE_256-NEXT: asrd z6.b, p0/m, z6.b, #5
; VBITS_GE_256-NEXT: asrd z7.b, p0/m, z7.b, #5
; VBITS_GE_256-NEXT: st1b { z6.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT: st1b { z4.b }, p0, [x0, x13]
; VBITS_GE_256-NEXT: st1b { z5.b }, p0, [x0, x14]
; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0, x11]
; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0, x12]
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x9]
; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0, x10]
; VBITS_GE_256-NEXT: st1b { z7.b }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: sdiv_v256i8:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.b, vl256
; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_2048-NEXT: asrd z0.b, p0/m, z0.b, #5
; VBITS_GE_2048-NEXT: st1b { z0.b }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
define void @sdiv_v256i8(<256 x i8>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v256i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl256
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: asrd z0.b, p0/m, z0.b, #5
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <256 x i8>, <256 x i8>* %a
%res = sdiv <256 x i8> %op1, shufflevector (<256 x i8> insertelement (<256 x i8> poison, i8 32, i32 0), <256 x i8> poison, <256 x i32> zeroinitializer)
store <256 x i8> %res, <256 x i8>* %a
ret void
}
define <4 x i16> @sdiv_v4i16(<4 x i16> %op1) #0 {
define <4 x i16> @sdiv_v4i16(<4 x i16> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@ -177,7 +109,7 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1) #0 {
ret <4 x i16> %res
}
define <8 x i16> @sdiv_v8i16(<8 x i16> %op1) #0 {
define <8 x i16> @sdiv_v8i16(<8 x i16> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@ -189,7 +121,7 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1) #0 {
ret <8 x i16> %res
}
define void @sdiv_v16i16(<16 x i16>* %a) #0 {
define void @sdiv_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v16i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@ -229,91 +161,35 @@ define void @sdiv_v32i16(<32 x i16>* %a) #0 {
ret void
}
define void @sdiv_v64i16(<64 x i16>* %a) #0 {
; VBITS_GE_256-LABEL: sdiv_v64i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #48
; VBITS_GE_256-NEXT: mov x9, #16
; VBITS_GE_256-NEXT: mov x10, #32
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: asrd z1.h, p0/m, z1.h, #5
; VBITS_GE_256-NEXT: asrd z0.h, p0/m, z0.h, #5
; VBITS_GE_256-NEXT: asrd z2.h, p0/m, z2.h, #5
; VBITS_GE_256-NEXT: asrd z3.h, p0/m, z3.h, #5
; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: sdiv_v64i16:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: asrd z0.h, p0/m, z0.h, #5
; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
define void @sdiv_v64i16(<64 x i16>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: sdiv_v64i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: asrd z0.h, p0/m, z0.h, #5
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%res = sdiv <64 x i16> %op1, shufflevector (<64 x i16> insertelement (<64 x i16> poison, i16 32, i32 0), <64 x i16> poison, <64 x i32> zeroinitializer)
store <64 x i16> %res, <64 x i16>* %a
ret void
}
define void @sdiv_v128i16(<128 x i16>* %a) #0 {
; VBITS_GE_256-LABEL: sdiv_v128i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #96
; VBITS_GE_256-NEXT: mov x9, #48
; VBITS_GE_256-NEXT: mov x10, #16
; VBITS_GE_256-NEXT: mov x11, #80
; VBITS_GE_256-NEXT: mov x12, #32
; VBITS_GE_256-NEXT: mov x13, #112
; VBITS_GE_256-NEXT: mov x14, #64
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x11, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x12, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x0, x13, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x0, x14, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: asrd z1.h, p0/m, z1.h, #5
; VBITS_GE_256-NEXT: asrd z0.h, p0/m, z0.h, #5
; VBITS_GE_256-NEXT: asrd z3.h, p0/m, z3.h, #5
; VBITS_GE_256-NEXT: asrd z2.h, p0/m, z2.h, #5
; VBITS_GE_256-NEXT: asrd z5.h, p0/m, z5.h, #5
; VBITS_GE_256-NEXT: asrd z4.h, p0/m, z4.h, #5
; VBITS_GE_256-NEXT: asrd z6.h, p0/m, z6.h, #5
; VBITS_GE_256-NEXT: asrd z7.h, p0/m, z7.h, #5
; VBITS_GE_256-NEXT: st1h { z6.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0, x13, lsl #1]
; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x0, x14, lsl #1]
; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x11, lsl #1]
; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x12, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: sdiv_v128i16:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_2048-NEXT: asrd z0.h, p0/m, z0.h, #5
; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
define void @sdiv_v128i16(<128 x i16>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v128i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: asrd z0.h, p0/m, z0.h, #5
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <128 x i16>, <128 x i16>* %a
%res = sdiv <128 x i16> %op1, shufflevector (<128 x i16> insertelement (<128 x i16> poison, i16 32, i32 0), <128 x i16> poison, <128 x i32> zeroinitializer)
store <128 x i16> %res, <128 x i16>* %a
ret void
}
define <2 x i32> @sdiv_v2i32(<2 x i32> %op1) #0 {
define <2 x i32> @sdiv_v2i32(<2 x i32> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@ -325,7 +201,7 @@ define <2 x i32> @sdiv_v2i32(<2 x i32> %op1) #0 {
ret <2 x i32> %res
}
define <4 x i32> @sdiv_v4i32(<4 x i32> %op1) #0 {
define <4 x i32> @sdiv_v4i32(<4 x i32> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@ -337,7 +213,7 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1) #0 {
ret <4 x i32> %res
}
define void @sdiv_v8i32(<8 x i32>* %a) #0 {
define void @sdiv_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@ -377,91 +253,35 @@ define void @sdiv_v16i32(<16 x i32>* %a) #0 {
ret void
}
define void @sdiv_v32i32(<32 x i32>* %a) #0 {
; VBITS_GE_256-LABEL: sdiv_v32i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #24
; VBITS_GE_256-NEXT: mov x9, #8
; VBITS_GE_256-NEXT: mov x10, #16
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: asrd z1.s, p0/m, z1.s, #5
; VBITS_GE_256-NEXT: asrd z0.s, p0/m, z0.s, #5
; VBITS_GE_256-NEXT: asrd z2.s, p0/m, z2.s, #5
; VBITS_GE_256-NEXT: asrd z3.s, p0/m, z3.s, #5
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: sdiv_v32i32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: asrd z0.s, p0/m, z0.s, #5
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
define void @sdiv_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: sdiv_v32i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: asrd z0.s, p0/m, z0.s, #5
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%res = sdiv <32 x i32> %op1, shufflevector (<32 x i32> insertelement (<32 x i32> poison, i32 32, i32 0), <32 x i32> poison, <32 x i32> zeroinitializer)
store <32 x i32> %res, <32 x i32>* %a
ret void
}
define void @sdiv_v64i32(<64 x i32>* %a) #0 {
; VBITS_GE_256-LABEL: sdiv_v64i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #48
; VBITS_GE_256-NEXT: mov x9, #24
; VBITS_GE_256-NEXT: mov x10, #8
; VBITS_GE_256-NEXT: mov x11, #40
; VBITS_GE_256-NEXT: mov x12, #16
; VBITS_GE_256-NEXT: mov x13, #56
; VBITS_GE_256-NEXT: mov x14, #32
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x12, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x13, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x14, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: asrd z1.s, p0/m, z1.s, #5
; VBITS_GE_256-NEXT: asrd z0.s, p0/m, z0.s, #5
; VBITS_GE_256-NEXT: asrd z3.s, p0/m, z3.s, #5
; VBITS_GE_256-NEXT: asrd z2.s, p0/m, z2.s, #5
; VBITS_GE_256-NEXT: asrd z5.s, p0/m, z5.s, #5
; VBITS_GE_256-NEXT: asrd z4.s, p0/m, z4.s, #5
; VBITS_GE_256-NEXT: asrd z6.s, p0/m, z6.s, #5
; VBITS_GE_256-NEXT: asrd z7.s, p0/m, z7.s, #5
; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x13, lsl #2]
; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x0, x14, lsl #2]
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x11, lsl #2]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0, x12, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: sdiv_v64i32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: asrd z0.s, p0/m, z0.s, #5
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
define void @sdiv_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v64i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: asrd z0.s, p0/m, z0.s, #5
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <64 x i32>, <64 x i32>* %a
%res = sdiv <64 x i32> %op1, shufflevector (<64 x i32> insertelement (<64 x i32> poison, i32 32, i32 0), <64 x i32> poison, <64 x i32> zeroinitializer)
store <64 x i32> %res, <64 x i32>* %a
ret void
}
define <1 x i64> @sdiv_v1i64(<1 x i64> %op1) #0 {
define <1 x i64> @sdiv_v1i64(<1 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v1i64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@ -474,7 +294,7 @@ define <1 x i64> @sdiv_v1i64(<1 x i64> %op1) #0 {
}
; Vector i64 sdiv is not legal for NEON, so use SVE when available.
define <2 x i64> @sdiv_v2i64(<2 x i64> %op1) #0 {
define <2 x i64> @sdiv_v2i64(<2 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@ -486,7 +306,7 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1) #0 {
ret <2 x i64> %res
}
define void @sdiv_v4i64(<4 x i64>* %a) #0 {
define void @sdiv_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@ -526,84 +346,28 @@ define void @sdiv_v8i64(<8 x i64>* %a) #0 {
ret void
}
define void @sdiv_v16i64(<16 x i64>* %a) #0 {
; VBITS_GE_256-LABEL: sdiv_v16i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #12
; VBITS_GE_256-NEXT: mov x9, #4
; VBITS_GE_256-NEXT: mov x10, #8
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: asrd z1.d, p0/m, z1.d, #5
; VBITS_GE_256-NEXT: asrd z0.d, p0/m, z0.d, #5
; VBITS_GE_256-NEXT: asrd z2.d, p0/m, z2.d, #5
; VBITS_GE_256-NEXT: asrd z3.d, p0/m, z3.d, #5
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: sdiv_v16i64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: asrd z0.d, p0/m, z0.d, #5
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
define void @sdiv_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: sdiv_v16i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: asrd z0.d, p0/m, z0.d, #5
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%res = sdiv <16 x i64> %op1, shufflevector (<16 x i64> insertelement (<16 x i64> poison, i64 32, i32 0), <16 x i64> poison, <16 x i32> zeroinitializer)
store <16 x i64> %res, <16 x i64>* %a
ret void
}
define void @sdiv_v32i64(<32 x i64>* %a) #0 {
; VBITS_GE_256-LABEL: sdiv_v32i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #24
; VBITS_GE_256-NEXT: mov x9, #12
; VBITS_GE_256-NEXT: mov x10, #4
; VBITS_GE_256-NEXT: mov x11, #20
; VBITS_GE_256-NEXT: mov x12, #8
; VBITS_GE_256-NEXT: mov x13, #28
; VBITS_GE_256-NEXT: mov x14, #16
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x11, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x12, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x13, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x14, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: asrd z1.d, p0/m, z1.d, #5
; VBITS_GE_256-NEXT: asrd z0.d, p0/m, z0.d, #5
; VBITS_GE_256-NEXT: asrd z3.d, p0/m, z3.d, #5
; VBITS_GE_256-NEXT: asrd z2.d, p0/m, z2.d, #5
; VBITS_GE_256-NEXT: asrd z5.d, p0/m, z5.d, #5
; VBITS_GE_256-NEXT: asrd z4.d, p0/m, z4.d, #5
; VBITS_GE_256-NEXT: asrd z6.d, p0/m, z6.d, #5
; VBITS_GE_256-NEXT: asrd z7.d, p0/m, z7.d, #5
; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0, x13, lsl #3]
; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x0, x14, lsl #3]
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x11, lsl #3]
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x12, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: sdiv_v32i64:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: asrd z0.d, p0/m, z0.d, #5
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
define void @sdiv_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v32i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: asrd z0.d, p0/m, z0.d, #5
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%res = sdiv <32 x i64> %op1, shufflevector (<32 x i64> insertelement (<32 x i64> poison, i64 32, i32 0), <32 x i64> poison, <32 x i32> zeroinitializer)
  store <32 x i64> %res, <32 x i64>* %a
@ -7,7 +7,7 @@ target triple = "aarch64-unknown-linux-gnu"
; bigger than NEON. However, having no support opens us up to a code generator
; hang when expanding BUILD_VECTOR. Here we just validate the problematic case
; successfully exits code generation.
define void @hang_when_merging_stores_after_legalisation(<8 x i32>* %a, <2 x i32> %b) #0 {
define void @hang_when_merging_stores_after_legalisation(<8 x i32>* %a, <2 x i32> %b) vscale_range(2,2) #0 {
; CHECK-LABEL: hang_when_merging_stores_after_legalisation:
; CHECK: // %bb.0:
; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
@ -37,8 +37,8 @@ define void @hang_when_merging_stores_after_legalisation(<8 x i32>* %a, <2 x i32
ret void
}
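; Note that vscale_range(2,2) pins vscale to exactly 2 (a 256-bit SVE
; register), which these tests rely on; a maximum of 0, as in
; vscale_range(2,0), would instead leave the upper bound unconstrained.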
; Ensure we don't crash when trying to lower a shuffle via and extract
define void @crash_when_lowering_extract_shuffle(<32 x i32>* %dst, i1 %cond) #0 {
; Ensure we don't crash when trying to lower a shuffle via an extract
define void @crash_when_lowering_extract_shuffle(<32 x i32>* %dst, i1 %cond) vscale_range(2,2) #0 {
; CHECK-LABEL: crash_when_lowering_extract_shuffle:
; CHECK: // %bb.0:
; CHECK-NEXT: tbnz w1, #0, .LBB1_2
@ -132,4 +132,4 @@ exit:
ret void
}
attributes #0 = { vscale_range(2,2) "target-features"="+sve" }
attributes #0 = { "target-features"="+sve" }
@ -1,21 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; Don't use SVE when its registers are no bigger than NEON.
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
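; Since the 1024- and 2048-bit cases are covered by per-function
; vscale_range attributes, the RUN lines only need to distinguish 256-bit
; from 512-bit-or-wider code generation, so the min=2048 run reuses the
; VBITS_GE_512 prefix.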
target triple = "aarch64-unknown-linux-gnu"
@ -24,7 +10,7 @@ target triple = "aarch64-unknown-linux-gnu"
;
; Don't use SVE for 64-bit vectors.
define <8 x i8> @splat_v8i8(i8 %a) #0 {
define <8 x i8> @splat_v8i8(i8 %a) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: dup v0.8b, w0
@ -35,7 +21,7 @@ define <8 x i8> @splat_v8i8(i8 %a) #0 {
}
; Don't use SVE for 128-bit vectors.
define <16 x i8> @splat_v16i8(i8 %a) #0 {
define <16 x i8> @splat_v16i8(i8 %a) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: dup v0.16b, w0
@ -45,7 +31,7 @@ define <16 x i8> @splat_v16i8(i8 %a) #0 {
ret <16 x i8> %splat
}
define void @splat_v32i8(i8 %a, <32 x i8>* %b) #0 {
define void @splat_v32i8(i8 %a, <32 x i8>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl32
@ -74,68 +60,32 @@ define void @splat_v64i8(i8 %a, <64 x i8>* %b) #0 {
; VBITS_GE_512-NEXT: mov z0.b, w0
; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x1]
; VBITS_GE_512-NEXT: ret
; Ensure sensible type legalisation.
%insert = insertelement <64 x i8> undef, i8 %a, i64 0
%splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
store <64 x i8> %splat, <64 x i8>* %b
ret void
}
define void @splat_v128i8(i8 %a, <128 x i8>* %b) #0 {
; VBITS_GE_256-LABEL: splat_v128i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov w8, #96
; VBITS_GE_256-NEXT: mov w9, #64
; VBITS_GE_256-NEXT: mov w10, #32
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: mov z0.b, w0
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x8]
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x9]
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x10]
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: splat_v128i8:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.b, vl128
; VBITS_GE_1024-NEXT: mov z0.b, w0
; VBITS_GE_1024-NEXT: st1b { z0.b }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @splat_v128i8(i8 %a, <128 x i8>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: splat_v128i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl128
; CHECK-NEXT: mov z0.b, w0
; CHECK-NEXT: st1b { z0.b }, p0, [x1]
; CHECK-NEXT: ret
%insert = insertelement <128 x i8> undef, i8 %a, i64 0
%splat = shufflevector <128 x i8> %insert, <128 x i8> undef, <128 x i32> zeroinitializer
store <128 x i8> %splat, <128 x i8>* %b
ret void
}
define void @splat_v256i8(i8 %a, <256 x i8>* %b) #0 {
; VBITS_GE_256-LABEL: splat_v256i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov w8, #224
; VBITS_GE_256-NEXT: mov w9, #192
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: mov z0.b, w0
; VBITS_GE_256-NEXT: mov w10, #160
; VBITS_GE_256-NEXT: mov w11, #128
; VBITS_GE_256-NEXT: mov w12, #96
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x8]
; VBITS_GE_256-NEXT: mov w8, #64
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x9]
; VBITS_GE_256-NEXT: mov w9, #32
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x10]
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x11]
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x12]
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x8]
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1]
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x9]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: splat_v256i8:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.b, vl256
; VBITS_GE_2048-NEXT: mov z0.b, w0
; VBITS_GE_2048-NEXT: st1b { z0.b }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
define void @splat_v256i8(i8 %a, <256 x i8>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: splat_v256i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl256
; CHECK-NEXT: mov z0.b, w0
; CHECK-NEXT: st1b { z0.b }, p0, [x1]
; CHECK-NEXT: ret
%insert = insertelement <256 x i8> undef, i8 %a, i64 0
%splat = shufflevector <256 x i8> %insert, <256 x i8> undef, <256 x i32> zeroinitializer
store <256 x i8> %splat, <256 x i8>* %b
@ -143,7 +93,7 @@ define void @splat_v256i8(i8 %a, <256 x i8>* %b) #0 {
}
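; Integer splats materialise the scalar into a Z register with, e.g.,
; "mov z0.b, w0" (the assembler alias of SVE DUP) and write it out with a
; predicated st1; only the predicate's vl and the addressing differ across
; vector lengths.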
; Don't use SVE for 64-bit vectors.
define <4 x i16> @splat_v4i16(i16 %a) #0 {
define <4 x i16> @splat_v4i16(i16 %a) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: dup v0.4h, w0
@ -154,7 +104,7 @@ define <4 x i16> @splat_v4i16(i16 %a) #0 {
}
; Don't use SVE for 128-bit vectors.
define <8 x i16> @splat_v8i16(i16 %a) #0 {
define <8 x i16> @splat_v8i16(i16 %a) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: dup v0.8h, w0
@ -164,7 +114,7 @@ define <8 x i16> @splat_v8i16(i16 %a) #0 {
ret <8 x i16> %splat
}
define void @splat_v16i16(i16 %a, <16 x i16>* %b) #0 {
define void @splat_v16i16(i16 %a, <16 x i16>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v16i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@ -193,68 +143,32 @@ define void @splat_v32i16(i16 %a, <32 x i16>* %b) #0 {
; VBITS_GE_512-NEXT: mov z0.h, w0
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_512-NEXT: ret
; Ensure sensible type legalisation.
%insert = insertelement <32 x i16> undef, i16 %a, i64 0
%splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
store <32 x i16> %splat, <32 x i16>* %b
ret void
}
define void @splat_v64i16(i16 %a, <64 x i16>* %b) #0 {
; VBITS_GE_256-LABEL: splat_v64i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #48
; VBITS_GE_256-NEXT: mov x9, #32
; VBITS_GE_256-NEXT: mov x10, #16
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: mov z0.h, w0
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x9, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x10, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: splat_v64i16:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
; VBITS_GE_1024-NEXT: mov z0.h, w0
; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @splat_v64i16(i16 %a, <64 x i16>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: splat_v64i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: mov z0.h, w0
; CHECK-NEXT: st1h { z0.h }, p0, [x1]
; CHECK-NEXT: ret
%insert = insertelement <64 x i16> undef, i16 %a, i64 0
%splat = shufflevector <64 x i16> %insert, <64 x i16> undef, <64 x i32> zeroinitializer
store <64 x i16> %splat, <64 x i16>* %b
ret void
}
define void @splat_v128i16(i16 %a, <128 x i16>* %b) #0 {
; VBITS_GE_256-LABEL: splat_v128i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #112
; VBITS_GE_256-NEXT: mov x9, #96
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: mov z0.h, w0
; VBITS_GE_256-NEXT: mov x10, #80
; VBITS_GE_256-NEXT: mov x11, #64
; VBITS_GE_256-NEXT: mov x12, #48
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: mov x8, #32
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x9, lsl #1]
; VBITS_GE_256-NEXT: mov x9, #16
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x10, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x11, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x12, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x9, lsl #1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: splat_v128i16:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
; VBITS_GE_2048-NEXT: mov z0.h, w0
; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
define void @splat_v128i16(i16 %a, <128 x i16>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: splat_v128i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: mov z0.h, w0
; CHECK-NEXT: st1h { z0.h }, p0, [x1]
; CHECK-NEXT: ret
%insert = insertelement <128 x i16> undef, i16 %a, i64 0
%splat = shufflevector <128 x i16> %insert, <128 x i16> undef, <128 x i32> zeroinitializer
store <128 x i16> %splat, <128 x i16>* %b
@ -262,7 +176,7 @@ define void @splat_v128i16(i16 %a, <128 x i16>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
define <2 x i32> @splat_v2i32(i32 %a) #0 {
define <2 x i32> @splat_v2i32(i32 %a) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: dup v0.2s, w0
@ -273,7 +187,7 @@ define <2 x i32> @splat_v2i32(i32 %a) #0 {
}
; Don't use SVE for 128-bit vectors.
define <4 x i32> @splat_v4i32(i32 %a) #0 {
define <4 x i32> @splat_v4i32(i32 %a) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: dup v0.4s, w0
@ -283,7 +197,7 @@ define <4 x i32> @splat_v4i32(i32 %a) #0 {
ret <4 x i32> %splat
}
define void @splat_v8i32(i32 %a, <8 x i32>* %b) #0 {
define void @splat_v8i32(i32 %a, <8 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@ -312,68 +226,32 @@ define void @splat_v16i32(i32 %a, <16 x i32>* %b) #0 {
; VBITS_GE_512-NEXT: mov z0.s, w0
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT: ret
; Ensure sensible type legalisation.
%insert = insertelement <16 x i32> undef, i32 %a, i64 0
%splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
store <16 x i32> %splat, <16 x i32>* %b
ret void
}
define void @splat_v32i32(i32 %a, <32 x i32>* %b) #0 {
; VBITS_GE_256-LABEL: splat_v32i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #24
; VBITS_GE_256-NEXT: mov x9, #16
; VBITS_GE_256-NEXT: mov x10, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov z0.s, w0
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: splat_v32i32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: mov z0.s, w0
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @splat_v32i32(i32 %a, <32 x i32>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: splat_v32i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: mov z0.s, w0
; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret
%insert = insertelement <32 x i32> undef, i32 %a, i64 0
%splat = shufflevector <32 x i32> %insert, <32 x i32> undef, <32 x i32> zeroinitializer
store <32 x i32> %splat, <32 x i32>* %b
ret void
}
define void @splat_v64i32(i32 %a, <64 x i32>* %b) #0 {
; VBITS_GE_256-LABEL: splat_v64i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #56
; VBITS_GE_256-NEXT: mov x9, #48
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov z0.s, w0
; VBITS_GE_256-NEXT: mov x10, #40
; VBITS_GE_256-NEXT: mov x11, #32
; VBITS_GE_256-NEXT: mov x12, #24
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: mov x9, #8
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x11, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x12, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: splat_v64i32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: mov z0.s, w0
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
define void @splat_v64i32(i32 %a, <64 x i32>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: splat_v64i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: mov z0.s, w0
; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret
%insert = insertelement <64 x i32> undef, i32 %a, i64 0
%splat = shufflevector <64 x i32> %insert, <64 x i32> undef, <64 x i32> zeroinitializer
store <64 x i32> %splat, <64 x i32>* %b
@ -381,7 +259,7 @@ define void @splat_v64i32(i32 %a, <64 x i32>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
define <1 x i64> @splat_v1i64(i64 %a) #0 {
define <1 x i64> @splat_v1i64(i64 %a) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v1i64:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d0, x0
@ -392,7 +270,7 @@ define <1 x i64> @splat_v1i64(i64 %a) #0 {
}
; Don't use SVE for 128-bit vectors.
define <2 x i64> @splat_v2i64(i64 %a) #0 {
define <2 x i64> @splat_v2i64(i64 %a) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: dup v0.2d, x0
@ -402,7 +280,7 @@ define <2 x i64> @splat_v2i64(i64 %a) #0 {
ret <2 x i64> %splat
}
define void @splat_v4i64(i64 %a, <4 x i64>* %b) #0 {
define void @splat_v4i64(i64 %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@ -431,68 +309,32 @@ define void @splat_v8i64(i64 %a, <8 x i64>* %b) #0 {
; VBITS_GE_512-NEXT: mov z0.d, x0
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT: ret
; Ensure sensible type legalisation.
%insert = insertelement <8 x i64> undef, i64 %a, i64 0
%splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
store <8 x i64> %splat, <8 x i64>* %b
ret void
}
define void @splat_v16i64(i64 %a, <16 x i64>* %b) #0 {
; VBITS_GE_256-LABEL: splat_v16i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #12
; VBITS_GE_256-NEXT: mov x9, #8
; VBITS_GE_256-NEXT: mov x10, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov z0.d, x0
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: splat_v16i64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: mov z0.d, x0
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @splat_v16i64(i64 %a, <16 x i64>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: splat_v16i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: mov z0.d, x0
; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; CHECK-NEXT: ret
%insert = insertelement <16 x i64> undef, i64 %a, i64 0
%splat = shufflevector <16 x i64> %insert, <16 x i64> undef, <16 x i32> zeroinitializer
store <16 x i64> %splat, <16 x i64>* %b
ret void
}
define void @splat_v32i64(i64 %a, <32 x i64>* %b) #0 {
; VBITS_GE_256-LABEL: splat_v32i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #28
; VBITS_GE_256-NEXT: mov x9, #24
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov z0.d, x0
; VBITS_GE_256-NEXT: mov x10, #20
; VBITS_GE_256-NEXT: mov x11, #16
; VBITS_GE_256-NEXT: mov x12, #12
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: mov x9, #4
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x11, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x12, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: splat_v32i64:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: mov z0.d, x0
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
define void @splat_v32i64(i64 %a, <32 x i64>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: splat_v32i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: mov z0.d, x0
; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; CHECK-NEXT: ret
%insert = insertelement <32 x i64> undef, i64 %a, i64 0
%splat = shufflevector <32 x i64> %insert, <32 x i64> undef, <32 x i32> zeroinitializer
store <32 x i64> %splat, <32 x i64>* %b
@ -504,7 +346,7 @@ define void @splat_v32i64(i64 %a, <32 x i64>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
define <4 x half> @splat_v4f16(half %a) #0 {
define <4 x half> @splat_v4f16(half %a) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0
@ -516,7 +358,7 @@ define <4 x half> @splat_v4f16(half %a) #0 {
}
; Don't use SVE for 128-bit vectors.
define <8 x half> @splat_v8f16(half %a) #0 {
define <8 x half> @splat_v8f16(half %a) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0
@ -527,7 +369,7 @@ define <8 x half> @splat_v8f16(half %a) #0 {
ret <8 x half> %splat
}
define void @splat_v16f16(half %a, <16 x half>* %b) #0 {
define void @splat_v16f16(half %a, <16 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
@ -559,72 +401,34 @@ define void @splat_v32f16(half %a, <32 x half>* %b) #0 {
; VBITS_GE_512-NEXT: mov z0.h, h0
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT: ret
; Ensure sensible type legalisation.
%insert = insertelement <32 x half> undef, half %a, i64 0
%splat = shufflevector <32 x half> %insert, <32 x half> undef, <32 x i32> zeroinitializer
store <32 x half> %splat, <32 x half>* %b
ret void
}
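; The "// kill" comments in the checks are register-liveness annotations
; from the machine IR: the scalar FP input (e.g. $h0) is the low element
; of the corresponding $z0 register, so it is marked as redefined before
; the SVE splat reads it.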
define void @splat_v64f16(half %a, <64 x half>* %b) #0 {
; VBITS_GE_256-LABEL: splat_v64f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #48
; VBITS_GE_256-NEXT: mov x9, #32
; VBITS_GE_256-NEXT: mov x10, #16
; VBITS_GE_256-NEXT: // kill: def $h0 killed $h0 def $z0
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: mov z0.h, h0
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: splat_v64f16:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: // kill: def $h0 killed $h0 def $z0
; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
; VBITS_GE_1024-NEXT: mov z0.h, h0
; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
define void @splat_v64f16(half %a, <64 x half>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: splat_v64f16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: mov z0.h, h0
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%insert = insertelement <64 x half> undef, half %a, i64 0
%splat = shufflevector <64 x half> %insert, <64 x half> undef, <64 x i32> zeroinitializer
store <64 x half> %splat, <64 x half>* %b
ret void
}
define void @splat_v128f16(half %a, <128 x half>* %b) #0 {
; VBITS_GE_256-LABEL: splat_v128f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #112
; VBITS_GE_256-NEXT: mov x9, #96
; VBITS_GE_256-NEXT: // kill: def $h0 killed $h0 def $z0
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: mov x10, #80
; VBITS_GE_256-NEXT: mov z0.h, h0
; VBITS_GE_256-NEXT: mov x11, #64
; VBITS_GE_256-NEXT: mov x12, #48
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: mov x8, #32
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: mov x9, #16
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x11, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x12, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: splat_v128f16:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: // kill: def $h0 killed $h0 def $z0
; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
; VBITS_GE_2048-NEXT: mov z0.h, h0
; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
define void @splat_v128f16(half %a, <128 x half>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: splat_v128f16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: mov z0.h, h0
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%insert = insertelement <128 x half> undef, half %a, i64 0
%splat = shufflevector <128 x half> %insert, <128 x half> undef, <128 x i32> zeroinitializer
store <128 x half> %splat, <128 x half>* %b
@ -632,7 +436,7 @@ define void @splat_v128f16(half %a, <128 x half>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
define <2 x float> @splat_v2f32(float %a, <2 x float> %op2) #0 {
define <2 x float> @splat_v2f32(float %a, <2 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
@ -644,7 +448,7 @@ define <2 x float> @splat_v2f32(float %a, <2 x float> %op2) #0 {
}
; Don't use SVE for 128-bit vectors.
define <4 x float> @splat_v4f32(float %a, <4 x float> %op2) #0 {
define <4 x float> @splat_v4f32(float %a, <4 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
@ -655,7 +459,7 @@ define <4 x float> @splat_v4f32(float %a, <4 x float> %op2) #0 {
ret <4 x float> %splat
}
define void @splat_v8f32(float %a, <8 x float>* %b) #0 {
define void @splat_v8f32(float %a, <8 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
@ -687,72 +491,34 @@ define void @splat_v16f32(float %a, <16 x float>* %b) #0 {
; VBITS_GE_512-NEXT: mov z0.s, s0
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
; Ensure sensible type legalisation.
%insert = insertelement <16 x float> undef, float %a, i64 0
%splat = shufflevector <16 x float> %insert, <16 x float> undef, <16 x i32> zeroinitializer
store <16 x float> %splat, <16 x float>* %b
ret void
}
define void @splat_v32f32(float %a, <32 x float>* %b) #0 {
; VBITS_GE_256-LABEL: splat_v32f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #24
; VBITS_GE_256-NEXT: mov x9, #16
; VBITS_GE_256-NEXT: mov x10, #8
; VBITS_GE_256-NEXT: // kill: def $s0 killed $s0 def $z0
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov z0.s, s0
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: splat_v32f32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: // kill: def $s0 killed $s0 def $z0
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: mov z0.s, s0
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
define void @splat_v32f32(float %a, <32 x float>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: splat_v32f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: mov z0.s, s0
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%insert = insertelement <32 x float> undef, float %a, i64 0
%splat = shufflevector <32 x float> %insert, <32 x float> undef, <32 x i32> zeroinitializer
store <32 x float> %splat, <32 x float>* %b
ret void
}
define void @splat_v64f32(float %a, <64 x float>* %b) #0 {
; VBITS_GE_256-LABEL: splat_v64f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #56
; VBITS_GE_256-NEXT: mov x9, #48
; VBITS_GE_256-NEXT: // kill: def $s0 killed $s0 def $z0
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x10, #40
; VBITS_GE_256-NEXT: mov z0.s, s0
; VBITS_GE_256-NEXT: mov x11, #32
; VBITS_GE_256-NEXT: mov x12, #24
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: mov x9, #8
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x11, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x12, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: splat_v64f32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: // kill: def $s0 killed $s0 def $z0
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: mov z0.s, s0
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
define void @splat_v64f32(float %a, <64 x float>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: splat_v64f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: mov z0.s, s0
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%insert = insertelement <64 x float> undef, float %a, i64 0
%splat = shufflevector <64 x float> %insert, <64 x float> undef, <64 x i32> zeroinitializer
store <64 x float> %splat, <64 x float>* %b
@ -760,7 +526,7 @@ define void @splat_v64f32(float %a, <64 x float>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
define <1 x double> @splat_v1f64(double %a, <1 x double> %op2) #0 {
define <1 x double> @splat_v1f64(double %a, <1 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ret
@ -770,7 +536,7 @@ define <1 x double> @splat_v1f64(double %a, <1 x double> %op2) #0 {
}
; Don't use SVE for 128-bit vectors.
define <2 x double> @splat_v2f64(double %a, <2 x double> %op2) #0 {
define <2 x double> @splat_v2f64(double %a, <2 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
@ -781,7 +547,7 @@ define <2 x double> @splat_v2f64(double %a, <2 x double> %op2) #0 {
ret <2 x double> %splat
}
define void @splat_v4f64(double %a, <4 x double>* %b) #0 {
define void @splat_v4f64(double %a, <4 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@ -813,72 +579,34 @@ define void @splat_v8f64(double %a, <8 x double>* %b) #0 {
; VBITS_GE_512-NEXT: mov z0.d, d0
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
; Ensure sensible type legalisation.
%insert = insertelement <8 x double> undef, double %a, i64 0
%splat = shufflevector <8 x double> %insert, <8 x double> undef, <8 x i32> zeroinitializer
store <8 x double> %splat, <8 x double>* %b
ret void
}
define void @splat_v16f64(double %a, <16 x double>* %b) #0 {
; VBITS_GE_256-LABEL: splat_v16f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #12
; VBITS_GE_256-NEXT: mov x9, #8
; VBITS_GE_256-NEXT: mov x10, #4
; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov z0.d, d0
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: splat_v16f64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: // kill: def $d0 killed $d0 def $z0
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: mov z0.d, d0
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
define void @splat_v16f64(double %a, <16 x double>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: splat_v16f64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: mov z0.d, d0
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%insert = insertelement <16 x double> undef, double %a, i64 0
%splat = shufflevector <16 x double> %insert, <16 x double> undef, <16 x i32> zeroinitializer
store <16 x double> %splat, <16 x double>* %b
ret void
}
define void @splat_v32f64(double %a, <32 x double>* %b) #0 {
; VBITS_GE_256-LABEL: splat_v32f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #28
; VBITS_GE_256-NEXT: mov x9, #24
; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x10, #20
; VBITS_GE_256-NEXT: mov z0.d, d0
; VBITS_GE_256-NEXT: mov x11, #16
; VBITS_GE_256-NEXT: mov x12, #12
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: mov x9, #4
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x11, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x12, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: splat_v32f64:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: // kill: def $d0 killed $d0 def $z0
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: mov z0.d, d0
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
define void @splat_v32f64(double %a, <32 x double>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: splat_v32f64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: mov z0.d, d0
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%insert = insertelement <32 x double> undef, double %a, i64 0
%splat = shufflevector <32 x double> %insert, <32 x double> undef, <32 x i32> zeroinitializer
store <32 x double> %splat, <32 x double>* %b
@ -889,88 +617,52 @@ define void @splat_v32f64(double %a, <32 x double>* %b) #0 {
; DUP (integer immediate)
;
define void @splat_imm_v64i8(<64 x i8>* %a) #0 {
; VBITS_GE_256-LABEL: splat_imm_v64i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov w8, #32
; VBITS_GE_256-NEXT: mov z0.b, #1 // =0x1
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0]
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: splat_imm_v64i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: mov z0.b, #1 // =0x1
; VBITS_GE_512-NEXT: ptrue p0.b, vl64
; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT: ret
define void @splat_imm_v64i8(<64 x i8>* %a) vscale_range(4,0) #0 {
; CHECK-LABEL: splat_imm_v64i8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.b, #1 // =0x1
; CHECK-NEXT: ptrue p0.b, vl64
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%insert = insertelement <64 x i8> undef, i8 1, i64 0
%splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
store <64 x i8> %splat, <64 x i8>* %a
ret void
}
define void @splat_imm_v32i16(<32 x i16>* %a) #0 {
; VBITS_GE_256-LABEL: splat_imm_v32i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: mov z0.h, #2 // =0x2
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: splat_imm_v32i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: mov z0.h, #2 // =0x2
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT: ret
define void @splat_imm_v32i16(<32 x i16>* %a) vscale_range(4,0) #0 {
; CHECK-LABEL: splat_imm_v32i16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.h, #2 // =0x2
; CHECK-NEXT: ptrue p0.h, vl32
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%insert = insertelement <32 x i16> undef, i16 2, i64 0
%splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
store <32 x i16> %splat, <32 x i16>* %a
ret void
}
define void @splat_imm_v16i32(<16 x i32>* %a) #0 {
; VBITS_GE_256-LABEL: splat_imm_v16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: mov z0.s, #3 // =0x3
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: splat_imm_v16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: mov z0.s, #3 // =0x3
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
define void @splat_imm_v16i32(<16 x i32>* %a) vscale_range(4,0) #0 {
; CHECK-LABEL: splat_imm_v16i32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.s, #3 // =0x3
; CHECK-NEXT: ptrue p0.s, vl16
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%insert = insertelement <16 x i32> undef, i32 3, i64 0
%splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
store <16 x i32> %splat, <16 x i32>* %a
ret void
}
define void @splat_imm_v8i64(<8 x i64>* %a) #0 {
; VBITS_GE_256-LABEL: splat_imm_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: mov z0.d, #4 // =0x4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: splat_imm_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: mov z0.d, #4 // =0x4
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
define void @splat_imm_v8i64(<8 x i64>* %a) vscale_range(4,0) #0 {
; CHECK-LABEL: splat_imm_v8i64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.d, #4 // =0x4
; CHECK-NEXT: ptrue p0.d, vl8
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%insert = insertelement <8 x i64> undef, i64 4, i64 0
%splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
store <8 x i64> %splat, <8 x i64>* %a
@ -981,69 +673,43 @@ define void @splat_imm_v8i64(<8 x i64>* %a) #0 {
; DUP (floating-point immediate)
;
define void @splat_imm_v32f16(<32 x half>* %a) #0 {
; VBITS_GE_256-LABEL: splat_imm_v32f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: fmov z0.h, #5.00000000
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: splat_imm_v32f16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: fmov z0.h, #5.00000000
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT: ret
define void @splat_imm_v32f16(<32 x half>* %a) vscale_range(4,0) #0 {
; CHECK-LABEL: splat_imm_v32f16:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov z0.h, #5.00000000
; CHECK-NEXT: ptrue p0.h, vl32
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%insert = insertelement <32 x half> undef, half 5.0, i64 0
%splat = shufflevector <32 x half> %insert, <32 x half> undef, <32 x i32> zeroinitializer
store <32 x half> %splat, <32 x half>* %a
ret void
}
define void @splat_imm_v16f32(<16 x float>* %a) #0 {
; VBITS_GE_256-LABEL: splat_imm_v16f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: fmov z0.s, #6.00000000
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: splat_imm_v16f32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: fmov z0.s, #6.00000000
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
define void @splat_imm_v16f32(<16 x float>* %a) vscale_range(4,0) #0 {
; CHECK-LABEL: splat_imm_v16f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov z0.s, #6.00000000
; CHECK-NEXT: ptrue p0.s, vl16
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%insert = insertelement <16 x float> undef, float 6.0, i64 0
%splat = shufflevector <16 x float> %insert, <16 x float> undef, <16 x i32> zeroinitializer
store <16 x float> %splat, <16 x float>* %a
ret void
}
define void @splat_imm_v8f64(<8 x double>* %a) #0 {
; VBITS_GE_256-LABEL: splat_imm_v8f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: fmov z0.d, #7.00000000
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: splat_imm_v8f64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: fmov z0.d, #7.00000000
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
define void @splat_imm_v8f64(<8 x double>* %a) vscale_range(4,0) #0 {
; CHECK-LABEL: splat_imm_v8f64:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov z0.d, #7.00000000
; CHECK-NEXT: ptrue p0.d, vl8
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%insert = insertelement <8 x double> undef, double 7.0, i64 0
%splat = shufflevector <8 x double> %insert, <8 x double> undef, <8 x i32> zeroinitializer
store <8 x double> %splat, <8 x double>* %a
ret void
}
attributes #0 = { "target-features"="+sve" }


@ -1,35 +1,29 @@
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK
; VBYTES represents the useful byte size of a vector register from the code
; generator's point of view. It is clamped to power-of-2 values because
; only power-of-2 vector lengths are considered legal, regardless of the
; user specified vector length.
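; For example, -aarch64-sve-vector-bits-min=384 clamps to 256 usable bits
; (VBYTES=32), so an expression such as vl[[#min(div(VBYTES,4),8)]] below
; evaluates to vl8.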
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_2048
target triple = "aarch64-unknown-linux-gnu"
; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: ptrue
; Don't use SVE for 64-bit vectors.
define void @store_v2f32(<2 x float>* %a) #0 {
; CHECK-LABEL: store_v2f32:
; CHECK: str xzr, [x0]
; CHECK: ret
; CHECK: // %bb.0:
; CHECK-NEXT: str xzr, [x0]
; CHECK-NEXT: ret
store <2 x float> zeroinitializer, <2 x float>* %a
ret void
}
@ -37,66 +31,148 @@ define void @store_v2f32(<2 x float>* %a) #0 {
; Don't use SVE for 128-bit vectors.
define void @store_v4f32(<4 x float>* %a) #0 {
; CHECK-LABEL: store_v4f32:
; CHECK: stp xzr, xzr, [x0]
; CHECK: ret
; CHECK: // %bb.0:
; CHECK-NEXT: stp xzr, xzr, [x0]
; CHECK-NEXT: ret
store <4 x float> zeroinitializer, <4 x float>* %a
ret void
}
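; NOTE (commentary): for these sub-SVE sizes the zeroinitializer store is
; materialised with the zero register (str xzr / stp xzr, xzr) rather than an
; SVE predicated store.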
define void @store_v8f32(<8 x float>* %a) #0 {
; CHECK-LABEL: store_v8f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
; CHECK: st1w { z{{[0-9]+}}.s }, [[PG]], [x0]
; CHECK: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: mov z0.s, #0 // =0x0
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
store <8 x float> zeroinitializer, <8 x float>* %a
ret void
}
define void @store_v16f32(<16 x float>* %a) #0 {
; CHECK-LABEL: store_v16f32:
; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
; CHECK-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0]
; VBITS_LE_256-DAG: mov x[[A1:[0-9]+]], #[[#div(VBYTES,4)]]
; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A1]], lsl #2]
; CHECK: ret
; VBITS_GE_256-LABEL: store_v16f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov z0.s, #0 // =0x0
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: store_v16f32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: mov z0.s, #0 // =0x0
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
;
; VBITS_GE_1024-LABEL: store_v16f32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
; VBITS_GE_1024-NEXT: mov z0.s, #0 // =0x0
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: store_v16f32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl16
; VBITS_GE_2048-NEXT: mov z0.s, #0 // =0x0
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
store <16 x float> zeroinitializer, <16 x float>* %a
ret void
}
define void @store_v32f32(<32 x float>* %a) #0 {
; CHECK-LABEL: store_v32f32:
; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
; CHECK-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0]
; VBITS_LE_512-DAG: mov x[[A1:[0-9]+]], #[[#div(VBYTES,4)]]
; VBITS_LE_512-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A1]], lsl #2]
; VBITS_LE_256-DAG: mov x[[A2:[0-9]+]], #[[#mul(div(VBYTES,4),2)]]
; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A2]], lsl #2]
; VBITS_LE_256-DAG: mov x[[A3:[0-9]+]], #[[#mul(div(VBYTES,4),3)]]
; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A3]], lsl #2]
; CHECK: ret
; VBITS_GE_256-LABEL: store_v32f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #24
; VBITS_GE_256-NEXT: mov x9, #16
; VBITS_GE_256-NEXT: mov x10, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov z0.s, #0 // =0x0
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: store_v32f32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: mov x8, #16
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: mov z0.s, #0 // =0x0
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_512-NEXT: ret
;
; VBITS_GE_1024-LABEL: store_v32f32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: mov z0.s, #0 // =0x0
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: store_v32f32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
; VBITS_GE_2048-NEXT: mov z0.s, #0 // =0x0
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
store <32 x float> zeroinitializer, <32 x float>* %a
ret void
}
define void @store_v64f32(<64 x float>* %a) #0 {
; CHECK-LABEL: store_v64f32:
; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
; CHECK-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0]
; VBITS_LE_1024-DAG: mov x[[A1:[0-9]+]], #[[#div(VBYTES,4)]]
; VBITS_LE_1024-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A1]], lsl #2]
; VBITS_LE_512-DAG: mov x[[A2:[0-9]+]], #[[#mul(div(VBYTES,4),2)]]
; VBITS_LE_512-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A2]], lsl #2]
; VBITS_LE_512-DAG: mov x[[A3:[0-9]+]], #[[#mul(div(VBYTES,4),3)]]
; VBITS_LE_512-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A3]], lsl #2]
; VBITS_LE_256-DAG: mov x[[A4:[0-9]+]], #[[#mul(div(VBYTES,4),4)]]
; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A4]], lsl #2]
; VBITS_LE_256-DAG: mov x[[A5:[0-9]+]], #[[#mul(div(VBYTES,4),5)]]
; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A5]], lsl #2]
; VBITS_LE_256-DAG: mov x[[A6:[0-9]+]], #[[#mul(div(VBYTES,4),6)]]
; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A6]], lsl #2]
; VBITS_LE_256-DAG: mov x[[A7:[0-9]+]], #[[#mul(div(VBYTES,4),7)]]
; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A7]], lsl #2]
; CHECK: ret
; VBITS_GE_256-LABEL: store_v64f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #56
; VBITS_GE_256-NEXT: mov x9, #48
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov z0.s, #0 // =0x0
; VBITS_GE_256-NEXT: mov x10, #40
; VBITS_GE_256-NEXT: mov x11, #32
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: mov x8, #24
; VBITS_GE_256-NEXT: mov x12, #16
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: mov x9, #8
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x11, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x12, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: store_v64f32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: mov x8, #48
; VBITS_GE_512-NEXT: mov x9, #32
; VBITS_GE_512-NEXT: mov x10, #16
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: mov z0.s, #0 // =0x0
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2]
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0, x10, lsl #2]
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
;
; VBITS_GE_1024-LABEL: store_v64f32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: mov x8, #32
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: mov z0.s, #0 // =0x0
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: store_v64f32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: mov z0.s, #0 // =0x0
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
store <64 x float> zeroinitializer, <64 x float>* %a
ret void
}


@ -1,21 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; Don't use SVE when its registers are no bigger than NEON.
; RUN: llc -aarch64-sve-vector-bits-min=128 -aarch64-enable-atomic-cfg-tidy=false < %s | not grep ptrue
; RUN: llc -aarch64-sve-vector-bits-min=256 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=384 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
; RUN: llc -aarch64-sve-vector-bits-min=2048 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
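; NOTE (commentary): the -aarch64-sve-vector-bits-min=128 RUN line above pipes
; the output through "not grep ptrue" to prove no SVE predicate is materialised
; when the SVE registers are no wider than NEON.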
; Test we can code generate patterns of the form:
; fixed_length_vector = ISD::EXTRACT_SUBVECTOR scalable_vector, 0
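; Each function below creates that pattern in the same way (an illustrative
; shape, not copied verbatim from any single test):
;   %a = load <N x ty>, <N x ty>* %in
;   br label %bb1
; bb1:
;   store <N x ty> %a, <N x ty>* %out
;   ret void
; The load is performed in an SVE register, so the fixed-length value stored
; in bb1 is the zero-offset subvector of a scalable vector.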
@ -28,7 +14,7 @@
target triple = "aarch64-unknown-linux-gnu"
define void @subvector_v8i16(<8 x i16> *%in, <8 x i16>* %out) #0 {
define void @subvector_v8i16(<8 x i16> *%in, <8 x i16>* %out) vscale_range(2,0) #0 {
; CHECK-LABEL: subvector_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
@ -42,7 +28,7 @@ bb1:
ret void
}
define void @subvector_v16i16(<16 x i16> *%in, <16 x i16>* %out) #0 {
define void @subvector_v16i16(<16 x i16> *%in, <16 x i16>* %out) vscale_range(2,0) #0 {
; CHECK-LABEL: subvector_v16i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@ -82,29 +68,13 @@ bb1:
ret void
}
define void @subvector_v64i16(<64 x i16> *%in, <64 x i16>* %out) #0 {
; VBITS_GE_256-LABEL: subvector_v64i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #48
; VBITS_GE_256-NEXT: mov x9, #32
; VBITS_GE_256-NEXT: mov x10, #16
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x9, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1, x10, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: subvector_v64i16:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @subvector_v64i16(<64 x i16> *%in, <64 x i16>* %out) vscale_range(8,0) #0 {
; CHECK-LABEL: subvector_v64i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: st1h { z0.h }, p0, [x1]
; CHECK-NEXT: ret
%a = load <64 x i16>, <64 x i16>* %in
br label %bb1
@ -113,7 +83,7 @@ bb1:
ret void
}
define void @subvector_v8i32(<8 x i32> *%in, <8 x i32>* %out) #0 {
define void @subvector_v8i32(<8 x i32> *%in, <8 x i32>* %out) vscale_range(2,0) #0 {
; CHECK-LABEL: subvector_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@ -153,29 +123,13 @@ bb1:
ret void
}
define void @subvector_v32i32(<32 x i32> *%in, <32 x i32>* %out) #0 {
; VBITS_GE_256-LABEL: subvector_v32i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #24
; VBITS_GE_256-NEXT: mov x9, #16
; VBITS_GE_256-NEXT: mov x10, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: subvector_v32i32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @subvector_v32i32(<32 x i32> *%in, <32 x i32>* %out) vscale_range(8,0) #0 {
; CHECK-LABEL: subvector_v32i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret
%a = load <32 x i32>, <32 x i32>* %in
br label %bb1
@ -184,41 +138,13 @@ bb1:
ret void
}
define void @subvector_v64i32(<64 x i32> *%in, <64 x i32>* %out) #0 {
; VBITS_GE_256-LABEL: subvector_v64i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #56
; VBITS_GE_256-NEXT: mov x9, #48
; VBITS_GE_256-NEXT: mov x10, #40
; VBITS_GE_256-NEXT: mov x11, #32
; VBITS_GE_256-NEXT: mov x12, #24
; VBITS_GE_256-NEXT: mov x13, #16
; VBITS_GE_256-NEXT: mov x14, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x11, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x12, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x13, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0, x14, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x1, x11, lsl #2]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x1, x13, lsl #2]
; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x1, x12, lsl #2]
; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x1, x14, lsl #2]
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: subvector_v64i32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
define void @subvector_v64i32(<64 x i32> *%in, <64 x i32>* %out) vscale_range(16,0) #0 {
; CHECK-LABEL: subvector_v64i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret
%a = load <64 x i32>, <64 x i32>* %in
br label %bb1
@ -228,23 +154,16 @@ bb1:
}
define void @subvector_v8i64(<8 x i64> *%in, <8 x i64>* %out) #0 {
; VBITS_GE_256-LABEL: subvector_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: subvector_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT: ret
define void @subvector_v8i64(<8 x i64> *%in, <8 x i64>* %out) vscale_range(2,0) #0 {
; CHECK-LABEL: subvector_v8i64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x8, #4
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0]
; CHECK-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
; CHECK-NEXT: st1d { z1.d }, p0, [x1]
; CHECK-NEXT: ret
%a = load <8 x i64>, <8 x i64>* %in
br label %bb1
@ -253,29 +172,13 @@ bb1:
ret void
}
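; NOTE (commentary): vscale_range(2,0) only guarantees 256-bit registers, so
; the 512-bit <8 x i64> copy above is legalised as two VL4 halves addressed at
; [x0] and [x0, x8, lsl #3].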
define void @subvector_v16i64(<16 x i64> *%in, <16 x i64>* %out) #0 {
; VBITS_GE_256-LABEL: subvector_v16i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #12
; VBITS_GE_256-NEXT: mov x9, #8
; VBITS_GE_256-NEXT: mov x10, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: subvector_v16i64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @subvector_v16i64(<16 x i64> *%in, <16 x i64>* %out) vscale_range(8,0) #0 {
; CHECK-LABEL: subvector_v16i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; CHECK-NEXT: ret
%a = load <16 x i64>, <16 x i64>* %in
br label %bb1
@ -284,41 +187,13 @@ bb1:
ret void
}
define void @subvector_v32i64(<32 x i64> *%in, <32 x i64>* %out) #0 {
; VBITS_GE_256-LABEL: subvector_v32i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #28
; VBITS_GE_256-NEXT: mov x9, #24
; VBITS_GE_256-NEXT: mov x10, #20
; VBITS_GE_256-NEXT: mov x11, #16
; VBITS_GE_256-NEXT: mov x12, #12
; VBITS_GE_256-NEXT: mov x13, #8
; VBITS_GE_256-NEXT: mov x14, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x11, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x12, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x13, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0, x14, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x1, x11, lsl #3]
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x1, x13, lsl #3]
; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x1, x12, lsl #3]
; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x1, x14, lsl #3]
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: subvector_v32i64:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
define void @subvector_v32i64(<32 x i64> *%in, <32 x i64>* %out) vscale_range(16,0) #0 {
; CHECK-LABEL: subvector_v32i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; CHECK-NEXT: ret
%a = load <32 x i64>, <32 x i64>* %in
br label %bb1
@ -327,7 +202,7 @@ bb1:
ret void
}
define void @subvector_v8f16(<8 x half> *%in, <8 x half>* %out) #0 {
define void @subvector_v8f16(<8 x half> *%in, <8 x half>* %out) vscale_range(2,0) #0 {
; CHECK-LABEL: subvector_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
@ -341,7 +216,7 @@ bb1:
ret void
}
define void @subvector_v16f16(<16 x half> *%in, <16 x half>* %out) #0 {
define void @subvector_v16f16(<16 x half> *%in, <16 x half>* %out) vscale_range(2,0) #0 {
; CHECK-LABEL: subvector_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@ -381,29 +256,13 @@ bb1:
ret void
}
define void @subvector_v64f16(<64 x half> *%in, <64 x half>* %out) #0 {
; VBITS_GE_256-LABEL: subvector_v64f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #48
; VBITS_GE_256-NEXT: mov x9, #32
; VBITS_GE_256-NEXT: mov x10, #16
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x9, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1, x10, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: subvector_v64f16:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @subvector_v64f16(<64 x half> *%in, <64 x half>* %out) vscale_range(8,0) #0 {
; CHECK-LABEL: subvector_v64f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: st1h { z0.h }, p0, [x1]
; CHECK-NEXT: ret
%a = load <64 x half>, <64 x half>* %in
br label %bb1
@ -412,7 +271,7 @@ bb1:
ret void
}
define void @subvector_v8f32(<8 x float> *%in, <8 x float>* %out) #0 {
define void @subvector_v8f32(<8 x float> *%in, <8 x float>* %out) vscale_range(2,0) #0 {
; CHECK-LABEL: subvector_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@ -452,29 +311,13 @@ bb1:
ret void
}
define void @subvector_v32f32(<32 x float> *%in, <32 x float>* %out) #0 {
; VBITS_GE_256-LABEL: subvector_v32f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #24
; VBITS_GE_256-NEXT: mov x9, #16
; VBITS_GE_256-NEXT: mov x10, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: subvector_v32f32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @subvector_v32f32(<32 x float> *%in, <32 x float>* %out) vscale_range(8,0) #0 {
; CHECK-LABEL: subvector_v32f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret
%a = load <32 x float>, <32 x float>* %in
br label %bb1
@ -483,41 +326,13 @@ bb1:
ret void
}
define void @subvector_v64f32(<64 x float> *%in, <64 x float>* %out) #0 {
; VBITS_GE_256-LABEL: subvector_v64f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #56
; VBITS_GE_256-NEXT: mov x9, #48
; VBITS_GE_256-NEXT: mov x10, #40
; VBITS_GE_256-NEXT: mov x11, #32
; VBITS_GE_256-NEXT: mov x12, #24
; VBITS_GE_256-NEXT: mov x13, #16
; VBITS_GE_256-NEXT: mov x14, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x11, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x12, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x13, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0, x14, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x1, x11, lsl #2]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x1, x13, lsl #2]
; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x1, x12, lsl #2]
; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x1, x14, lsl #2]
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: subvector_v64f32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
define void @subvector_v64f32(<64 x float> *%in, <64 x float>* %out) vscale_range(16,0) #0 {
; CHECK-LABEL: subvector_v64f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret
%a = load <64 x float>, <64 x float>* %in
br label %bb1
@ -550,29 +365,13 @@ bb1:
ret void
}
define void @subvector_v16f64(<16 x double> *%in, <16 x double>* %out) #0 {
; VBITS_GE_256-LABEL: subvector_v16f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #12
; VBITS_GE_256-NEXT: mov x9, #8
; VBITS_GE_256-NEXT: mov x10, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: subvector_v16f64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @subvector_v16f64(<16 x double> *%in, <16 x double>* %out) vscale_range(8,0) #0 {
; CHECK-LABEL: subvector_v16f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; CHECK-NEXT: ret
%a = load <16 x double>, <16 x double>* %in
br label %bb1
@ -581,41 +380,13 @@ bb1:
ret void
}
define void @subvector_v32f64(<32 x double> *%in, <32 x double>* %out) #0 {
; VBITS_GE_256-LABEL: subvector_v32f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #28
; VBITS_GE_256-NEXT: mov x9, #24
; VBITS_GE_256-NEXT: mov x10, #20
; VBITS_GE_256-NEXT: mov x11, #16
; VBITS_GE_256-NEXT: mov x12, #12
; VBITS_GE_256-NEXT: mov x13, #8
; VBITS_GE_256-NEXT: mov x14, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x11, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x12, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x13, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0, x14, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x1, x11, lsl #3]
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x1, x13, lsl #3]
; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x1, x12, lsl #3]
; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x1, x14, lsl #3]
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: subvector_v32f64:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
define void @subvector_v32f64(<32 x double> *%in, <32 x double>* %out) vscale_range(16,0) #0 {
; CHECK-LABEL: subvector_v32f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; CHECK-NEXT: ret
%a = load <32 x double>, <32 x double>* %in
br label %bb1


@ -1,43 +1,30 @@
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_EQ_256
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_2048,VBITS_GE_1024,VBITS_GE_512
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: ptrue
define void @store_trunc_v2i64i8(<2 x i64>* %ap, <2 x i8>* %dest) #0 {
; CHECK-LABEL: store_trunc_v2i64i8
; CHECK: ldr q[[Q0:[0-9]+]], [x0]
; CHECK: ptrue p[[P0:[0-9]+]].d, vl2
; CHECK-NEXT: st1b { z[[Q0]].d }, p[[P0]], [x1]
; CHECK-NEXT: ret
define void @store_trunc_v2i64i8(<2 x i64>* %ap, <2 x i8>* %dest) vscale_range(2,0) #0 {
; CHECK-LABEL: store_trunc_v2i64i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: st1b { z0.d }, p0, [x1]
; CHECK-NEXT: ret
%a = load <2 x i64>, <2 x i64>* %ap
%val = trunc <2 x i64> %a to <2 x i8>
store <2 x i8> %val, <2 x i8>* %dest
ret void
}
define void @store_trunc_v4i64i8(<4 x i64>* %ap, <4 x i8>* %dest) #0 {
; CHECK-LABEL: store_trunc_v4i64i8
; CHECK: ptrue p[[P0:[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
; CHECK-NEXT: st1b { z[[Q0]].d }, p[[P0]], [x1]
; CHECK-NEXT: ret
define void @store_trunc_v4i64i8(<4 x i64>* %ap, <4 x i8>* %dest) vscale_range(2,0) #0 {
; CHECK-LABEL: store_trunc_v4i64i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: st1b { z0.d }, p0, [x1]
; CHECK-NEXT: ret
%a = load <4 x i64>, <4 x i64>* %ap
%val = trunc <4 x i64> %a to <4 x i8>
store <4 x i8> %val, <4 x i8>* %dest
@ -45,48 +32,52 @@ define void @store_trunc_v4i64i8(<4 x i64>* %ap, <4 x i8>* %dest) #0 {
}
define void @store_trunc_v8i64i8(<8 x i64>* %ap, <8 x i8>* %dest) #0 {
; CHECK-LABEL: store_trunc_v8i64i8:
; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
; VBITS_GE_512-NEXT: st1b { [[Z0]].d }, p[[P0]], [x1]
; VBITS_GE_512-NEXT: ret
; Ensure sensible type legalisation
; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[DWORDS_LO:z[0-9]+]].d }, [[PG1]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[DWORDS_HI:z[0-9]+]].d }, [[PG1]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s, vl4
; VBITS_EQ_256-DAG: uzp1 [[WORDS_LO:z[0-9]+]].s, [[DWORDS_LO]].s, [[DWORDS_LO]].s
; VBITS_EQ_256-DAG: uzp1 [[WORDS_HI:z[0-9]+]].s, [[DWORDS_HI]].s, [[DWORDS_HI]].s
; VBITS_EQ_256-DAG: splice [[WORDS:z[0-9]+]].s, [[PG2]], [[WORDS_LO]].s, [[WORDS_HI]].s
; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].s, vl8
; VBITS_EQ_256-NEXT: st1b { [[WORDS]].s }, [[PG3]], [x1]
; VBITS_EQ_256-NEXT: ret
; VBITS_GE_256-LABEL: store_trunc_v8i64i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.s, vl4
; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: st1b { z1.s }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: store_trunc_v8i64i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: st1b { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT: ret
%a = load <8 x i64>, <8 x i64>* %ap
%val = trunc <8 x i64> %a to <8 x i8>
store <8 x i8> %val, <8 x i8>* %dest
ret void
}
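; NOTE (commentary): in the VBITS_GE_256 output above, the <8 x i64> source is
; loaded as two VL4 halves, uzp1 narrows each half to .s elements, splice
; concatenates them into a single VL8 vector, and the remaining word-to-byte
; truncation is folded into the truncating st1b store.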
define void @store_trunc_v16i64i8(<16 x i64>* %ap, <16 x i8>* %dest) #0 {
define void @store_trunc_v16i64i8(<16 x i64>* %ap, <16 x i8>* %dest) vscale_range(8,0) #0 {
; CHECK-LABEL: store_trunc_v16i64i8:
; VBITS_GE_1024: ptrue p[[P0:[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: st1b { [[Z0]].d }, p[[P0]], [x1]
; VBITS_GE_1024-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: st1b { z0.d }, p0, [x1]
; CHECK-NEXT: ret
%a = load <16 x i64>, <16 x i64>* %ap
%val = trunc <16 x i64> %a to <16 x i8>
store <16 x i8> %val, <16 x i8>* %dest
ret void
}
define void @store_trunc_v32i64i8(<32 x i64>* %ap, <32 x i8>* %dest) #0 {
define void @store_trunc_v32i64i8(<32 x i64>* %ap, <32 x i8>* %dest) vscale_range(16,0) #0 {
; CHECK-LABEL: store_trunc_v32i64i8:
; VBITS_GE_2048: ptrue p[[P0:[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: st1b { [[Z0]].d }, p[[P0]], [x1]
; VBITS_GE_2048-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: st1b { z0.d }, p0, [x1]
; CHECK-NEXT: ret
%a = load <32 x i64>, <32 x i64>* %ap
%val = trunc <32 x i64> %a to <32 x i8>
store <32 x i8> %val, <32 x i8>* %dest
@ -94,25 +85,27 @@ define void @store_trunc_v32i64i8(<32 x i64>* %ap, <32 x i8>* %dest) #0 {
}
define void @store_trunc_v8i64i16(<8 x i64>* %ap, <8 x i16>* %dest) #0 {
; CHECK-LABEL: store_trunc_v8i64i16:
; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
; VBITS_GE_512-NEXT: st1h { [[Z0]].d }, p[[P0]], [x1]
; VBITS_GE_512-NEXT: ret
; Ensure sensible type legalisation.
; Currently does not use the truncating store
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[DWORDS_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[DWORDS_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: uzp1 [[WORDS_LO:z[0-9]+]].s, [[DWORDS_LO]].s, [[DWORDS_LO]].s
; VBITS_EQ_256-DAG: uzp1 [[WORDS_HI:z[0-9]+]].s, [[DWORDS_HI]].s, [[DWORDS_HI]].s
; VBITS_EQ_256-DAG: uzp1 z[[HALFS_LO:[0-9]+]].h, [[WORDS_LO]].h, [[WORDS_LO]].h
; VBITS_EQ_256-DAG: uzp1 z[[HALFS_HI:[0-9]+]].h, [[WORDS_HI]].h, [[WORDS_HI]].h
; VBITS_EQ_256-NEXT: mov v[[HALFS_LO]].d[1], v[[HALFS_HI]].d[0]
; VBITS_EQ_256-NEXT: str q[[HALFS_LO]], [x1]
; VBITS_EQ_256-NEXT: ret
; VBITS_GE_256-LABEL: store_trunc_v8i64i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0]
; VBITS_GE_256-NEXT: str q1, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: store_trunc_v8i64i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: st1h { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT: ret
%a = load <8 x i64>, <8 x i64>* %ap
%val = trunc <8 x i64> %a to <8 x i16>
store <8 x i16> %val, <8 x i16>* %dest
@ -120,24 +113,26 @@ define void @store_trunc_v8i64i16(<8 x i64>* %ap, <8 x i16>* %dest) #0 {
}
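; NOTE (commentary): here the <8 x i16> result is only 128 bits wide, so
; rather than a truncating SVE store each half is narrowed with uzp1, the
; halves are merged into one q register with "mov v1.d[1], v0.d[0]", and the
; result is stored with a plain str.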
define void @store_trunc_v8i64i32(<8 x i64>* %ap, <8 x i32>* %dest) #0 {
; CHECK-LABEL: store_trunc_v8i64i32:
; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
; VBITS_GE_512-NEXT: st1w { [[Z0]].d }, p[[P0]], [x1]
; VBITS_GE_512-NEXT: ret
; Ensure sensible type legalisation
; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[DWORDS_LO:z[0-9]+]].d }, [[PG1]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[DWORDS_HI:z[0-9]+]].d }, [[PG1]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s, vl4
; VBITS_EQ_256-DAG: uzp1 [[WORDS_LO:z[0-9]+]].s, [[DWORDS_LO]].s, [[DWORDS_LO]].s
; VBITS_EQ_256-DAG: uzp1 [[WORDS_HI:z[0-9]+]].s, [[DWORDS_HI]].s, [[DWORDS_HI]].s
; VBITS_EQ_256-DAG: splice [[WORDS:z[0-9]+]].s, [[PG1]], [[WORDS_LO]].s, [[WORDS_HI]].s
; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].s, vl8
; VBITS_EQ_256-NEXT: st1w { [[WORDS]].s }, [[PG3]], [x1]
; VBITS_EQ_256-NEXT: ret
; VBITS_GE_256-LABEL: store_trunc_v8i64i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.s, vl4
; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: store_trunc_v8i64i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: st1w { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT: ret
%a = load <8 x i64>, <8 x i64>* %ap
%val = trunc <8 x i64> %a to <8 x i32>
store <8 x i32> %val, <8 x i32>* %dest
@ -145,25 +140,27 @@ define void @store_trunc_v8i64i32(<8 x i64>* %ap, <8 x i32>* %dest) #0 {
}
define void @store_trunc_v16i32i8(<16 x i32>* %ap, <16 x i8>* %dest) #0 {
; CHECK-LABEL: store_trunc_v16i32i8:
; VBITS_GE_512: ptrue p[[P0:[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
; VBITS_GE_512-NEXT: st1b { [[Z0]].s }, p[[P0]], [x1]
; VBITS_GE_512-NEXT: ret
; Ensure sensible type legalisation.
; Currently does not use the truncating store
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[WORDS_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[WORDS_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: uzp1 [[HALFS_LO:z[0-9]+]].h, [[WORDS_LO]].h, [[WORDS_LO]].h
; VBITS_EQ_256-DAG: uzp1 [[HALFS_HI:z[0-9]+]].h, [[WORDS_HI]].h, [[WORDS_HI]].h
; VBITS_EQ_256-DAG: uzp1 z[[BYTES_LO:[0-9]+]].b, [[HALFS_LO]].b, [[HALFS_LO]].b
; VBITS_EQ_256-DAG: uzp1 z[[BYTES_HI:[0-9]+]].b, [[HALFS_HI]].b, [[HALFS_HI]].b
; VBITS_EQ_256-NEXT: mov v[[BYTES_LO]].d[1], v[[BYTES_HI]].d[0]
; VBITS_EQ_256-NEXT: str q[[BYTES_LO]], [x1]
; VBITS_EQ_256-NEXT: ret
; VBITS_GE_256-LABEL: store_trunc_v16i32i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b
; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0]
; VBITS_GE_256-NEXT: str q1, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: store_trunc_v16i32i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: st1b { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT: ret
%a = load <16 x i32>, <16 x i32>* %ap
%val = trunc <16 x i32> %a to <16 x i8>
store <16 x i8> %val, <16 x i8>* %dest
@ -171,24 +168,26 @@ define void @store_trunc_v16i32i8(<16 x i32>* %ap, <16 x i8>* %dest) #0 {
}
define void @store_trunc_v16i32i16(<16 x i32>* %ap, <16 x i16>* %dest) #0 {
; CHECK-LABEL: store_trunc_v16i32i16:
; VBITS_GE_512: ptrue p[[P0:[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
; VBITS_GE_512-NEXT: st1h { [[Z0]].s }, p[[P0]], [x1]
; VBITS_GE_512-NEXT: ret
; Ensure sensible type legalisation
; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[WORDS_LO:z[0-9]+]].s }, [[PG1]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[WORDS_HI:z[0-9]+]].s }, [[PG1]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].h, vl8
; VBITS_EQ_256-DAG: uzp1 [[HALFS_LO:z[0-9]+]].h, [[WORDS_LO]].h, [[WORDS_LO]].h
; VBITS_EQ_256-DAG: uzp1 [[HALFS_HI:z[0-9]+]].h, [[WORDS_HI]].h, [[WORDS_HI]].h
; VBITS_EQ_256-DAG: splice [[HALFS:z[0-9]+]].h, [[PG2]], [[HALFS_LO]].h, [[HALFS_HI]].h
; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].h, vl16
; VBITS_EQ_256-NEXT: st1h { [[HALFS]].h }, [[PG3]], [x1]
; VBITS_EQ_256-NEXT: ret
; VBITS_GE_256-LABEL: store_trunc_v16i32i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.h, vl8
; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: store_trunc_v16i32i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: st1h { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT: ret
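; (Here the narrowed VBITS_GE_256 halves are rejoined with 'splice' into one
; SVE vector before a full-width st1h, rather than with the NEON 'mov' used
; above where the result fits in 128 bits.)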
%a = load <16 x i32>, <16 x i32>* %ap
%val = trunc <16 x i32> %a to <16 x i16>
store <16 x i16> %val, <16 x i16>* %dest
@@ -196,24 +195,26 @@ define void @store_trunc_v16i32i16(<16 x i32>* %ap, <16 x i16>* %dest) #0 {
}
define void @store_trunc_v32i16i8(<32 x i16>* %ap, <32 x i8>* %dest) #0 {
; CHECK-LABEL: store_trunc_v32i16i8:
; VBITS_GE_512: ptrue p[[P0:[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, p0/z, [x0]
; VBITS_GE_512-NEXT: st1b { [[Z0]].h }, p[[P0]], [x1]
; VBITS_GE_512-NEXT: ret
; Ensure sensible type legalisation
; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[HALFS_LO:z[0-9]+]].h }, [[PG1]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[HALFS_HI:z[0-9]+]].h }, [[PG1]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].b, vl16
; VBITS_EQ_256-DAG: uzp1 [[BYTES_LO:z[0-9]+]].b, [[HALFS_LO]].b, [[HALFS_LO]].b
; VBITS_EQ_256-DAG: uzp1 [[BYTES_HI:z[0-9]+]].b, [[HALFS_HI]].b, [[HALFS_HI]].b
; VBITS_EQ_256-DAG: splice [[BYTES:z[0-9]+]].b, [[PG2]], [[BYTES_LO]].b, [[BYTES_HI]].b
; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].b, vl32
; VBITS_EQ_256-NEXT: st1b { [[BYTES]].b }, [[PG3]], [x1]
; VBITS_EQ_256-NEXT: ret
; VBITS_GE_256-LABEL: store_trunc_v32i16i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.b, vl16
; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b
; VBITS_GE_256-NEXT: splice z1.b, p0, z1.b, z0.b
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: store_trunc_v32i16i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: st1b { z0.h }, p0, [x1]
; VBITS_GE_512-NEXT: ret
%a = load <32 x i16>, <32 x i16>* %ap
%val = trunc <32 x i16> %a to <32 x i8>
store <32 x i8> %val, <32 x i8>* %dest


@@ -1,35 +1,22 @@
; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
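; NOTE: The -aarch64-sve-vector-bits-min=2048 run shares the VBITS_GE_512
; prefixes because the functions still carrying per-width prefixes generate
; identical code for any register size of 512 bits or more. The assertions
; below can be regenerated by running llvm/utils/update_llc_test_checks.py
; over this file.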
target triple = "aarch64-unknown-linux-gnu"
; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: {{z[0-9]}}
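; NOTE: vscale_range(2,0) constrains vscale to at least 2 with no upper
; bound, i.e. SVE registers of at least 256 bits, so functions tagged with
; it are covered by the common CHECK prefix alone.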
;
; truncate i16 -> i8
;
define <16 x i8> @trunc_v16i16_v16i8(<16 x i16>* %in) #0 {
define <16 x i8> @trunc_v16i16_v16i8(<16 x i16>* %in) vscale_range(2,0) #0 {
; CHECK-LABEL: trunc_v16i16_v16i8:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[A_HALFS:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: uzp1 z0.b, [[A_HALFS]].b, [[A_HALFS]].b
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
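; (The '// kill' line above is a register-allocator annotation recording
; that the returned q0 value is the low 128 bits of z0; it is not an
; executed instruction.)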
%a = load <16 x i16>, <16 x i16>* %in
%b = trunc <16 x i16> %a to <16 x i8>
ret <16 x i8> %b
@@ -37,11 +24,30 @@ define <16 x i8> @trunc_v16i16_v16i8(<16 x i16>* %in) #0 {
; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
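; Without the 'add', the trunc would fold into a truncating store such as
; "st1b { z0.h }, p0, [x1]" (as in store_trunc_v32i16i8 above), leaving the
; standalone truncate untested.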
define void @trunc_v32i16_v32i8(<32 x i16>* %in, <32 x i8>* %out) #0 {
; CHECK-LABEL: trunc_v32i16_v32i8:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512: ld1h { [[A_HALFS:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b
; VBITS_GE_512: add [[A_BYTES]].b, [[A_BYTES]].b, [[A_BYTES]].b
; VBITS_GE_256-LABEL: trunc_v32i16_v32i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.b, vl16
; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b
; VBITS_GE_256-NEXT: splice z1.b, p0, z1.b, z0.b
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: add z0.b, z1.b, z1.b
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: trunc_v32i16_v32i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: ptrue p0.b, vl32
; VBITS_GE_512-NEXT: uzp1 z0.b, z0.b, z0.b
; VBITS_GE_512-NEXT: add z0.b, z0.b, z0.b
; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x1]
; VBITS_GE_512-NEXT: ret
%a = load <32 x i16>, <32 x i16>* %in
%b = trunc <32 x i16> %a to <32 x i8>
%c = add <32 x i8> %b, %b
@@ -50,12 +56,16 @@ define void @trunc_v32i16_v32i8(<32 x i16>* %in, <32 x i8>* %out) #0 {
}
; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v64i16_v64i8(<64 x i16>* %in, <64 x i8>* %out) #0 {
define void @trunc_v64i16_v64i8(<64 x i16>* %in, <64 x i8>* %out) vscale_range(8,0) #0 {
; CHECK-LABEL: trunc_v64i16_v64i8:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024: ld1h { [[A_HALFS:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b
; VBITS_GE_1024: add [[A_BYTES]].b, [[A_BYTES]].b, [[A_BYTES]].b
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ptrue p0.b, vl64
; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT: add z0.b, z0.b, z0.b
; CHECK-NEXT: st1b { z0.b }, p0, [x1]
; CHECK-NEXT: ret
%a = load <64 x i16>, <64 x i16>* %in
%b = trunc <64 x i16> %a to <64 x i8>
%c = add <64 x i8> %b, %b
@@ -64,12 +74,16 @@ define void @trunc_v64i16_v64i8(<64 x i16>* %in, <64 x i8>* %out) #0 {
}
; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v128i16_v128i8(<128 x i16>* %in, <128 x i8>* %out) #0 {
define void @trunc_v128i16_v128i8(<128 x i16>* %in, <128 x i8>* %out) vscale_range(16,0) #0 {
; CHECK-LABEL: trunc_v128i16_v128i8:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048: ld1h { [[A_HALFS:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b
; VBITS_GE_2048: add [[A_BYTES]].b, [[A_BYTES]].b, [[A_BYTES]].b
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ptrue p0.b, vl128
; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT: add z0.b, z0.b, z0.b
; CHECK-NEXT: st1b { z0.b }, p0, [x1]
; CHECK-NEXT: ret
%a = load <128 x i16>, <128 x i16>* %in
%b = trunc <128 x i16> %a to <128 x i8>
%c = add <128 x i8> %b, %b
@@ -81,38 +95,60 @@ define void @trunc_v128i16_v128i8(<128 x i16>* %in, <128 x i8>* %out) #0 {
; truncate i32 -> i8
;
define <8 x i8> @trunc_v8i32_v8i8(<8 x i32>* %in) #0 {
define <8 x i8> @trunc_v8i32_v8i8(<8 x i32>* %in) vscale_range(2,0) #0 {
; CHECK-LABEL: trunc_v8i32_v8i8:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
; CHECK-NEXT: uzp1 z0.b, [[A_HALFS]].b, [[A_HALFS]].b
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%a = load <8 x i32>, <8 x i32>* %in
%b = trunc <8 x i32> %a to <8 x i8>
ret <8 x i8> %b
}
define <16 x i8> @trunc_v16i32_v16i8(<16 x i32>* %in) #0 {
; CHECK-LABEL: trunc_v16i32_v16i8:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
; VBITS_GE_512-NEXT: uzp1 z0.b, [[A_HALFS]].b, [[A_HALFS]].b
; VBITS_GE_512-NEXT: ret
; VBITS_GE_256-LABEL: trunc_v16i32_v16i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT: uzp1 z2.b, z0.b, z0.b
; VBITS_GE_256-NEXT: uzp1 z0.b, z1.b, z1.b
; VBITS_GE_256-NEXT: mov v0.d[1], v2.d[0]
; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: trunc_v16i32_v16i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT: uzp1 z0.b, z0.b, z0.b
; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT: ret
%a = load <16 x i32>, <16 x i32>* %in
%b = trunc <16 x i32> %a to <16 x i8>
ret <16 x i8> %b
}
; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v32i32_v32i8(<32 x i32>* %in, <32 x i8>* %out) #0 {
define void @trunc_v32i32_v32i8(<32 x i32>* %in, <32 x i8>* %out) vscale_range(8,0) #0 {
; CHECK-LABEL: trunc_v32i32_v32i8:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
; VBITS_GE_1024: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b
; VBITS_GE_1024: add [[A_BYTES]].b, [[A_BYTES]].b, [[A_BYTES]].b
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ptrue p0.b, vl32
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT: add z0.b, z0.b, z0.b
; CHECK-NEXT: st1b { z0.b }, p0, [x1]
; CHECK-NEXT: ret
%a = load <32 x i32>, <32 x i32>* %in
%b = trunc <32 x i32> %a to <32 x i8>
%c = add <32 x i8> %b, %b
@@ -121,13 +157,17 @@ define void @trunc_v32i32_v32i8(<32 x i32>* %in, <32 x i8>* %out) #0 {
}
; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v64i32_v64i8(<64 x i32>* %in, <64 x i8>* %out) #0 {
define void @trunc_v64i32_v64i8(<64 x i32>* %in, <64 x i8>* %out) vscale_range(16,0) #0 {
; CHECK-LABEL: trunc_v64i32_v64i8:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
; VBITS_GE_2048: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b
; VBITS_GE_2048: add [[A_BYTES]].b, [[A_BYTES]].b, [[A_BYTES]].b
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ptrue p0.b, vl64
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT: add z0.b, z0.b, z0.b
; CHECK-NEXT: st1b { z0.b }, p0, [x1]
; CHECK-NEXT: ret
%a = load <64 x i32>, <64 x i32>* %in
%b = trunc <64 x i32> %a to <64 x i8>
%c = add <64 x i8> %b, %b
@@ -139,12 +179,14 @@ define void @trunc_v64i32_v64i8(<64 x i32>* %in, <64 x i8>* %out) #0 {
; truncate i32 -> i16
;
define <8 x i16> @trunc_v8i32_v8i16(<8 x i32>* %in) #0 {
define <8 x i16> @trunc_v8i32_v8i16(<8 x i32>* %in) vscale_range(2,0) #0 {
; CHECK-LABEL: trunc_v8i32_v8i16:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: uzp1 z0.h, [[A_WORDS]].h, [[A_WORDS]].h
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%a = load <8 x i32>, <8 x i32>* %in
%b = trunc <8 x i32> %a to <8 x i16>
ret <8 x i16> %b
@@ -152,11 +194,30 @@ define <8 x i16> @trunc_v8i32_v8i16(<8 x i32>* %in) #0 {
; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v16i32_v16i16(<16 x i32>* %in, <16 x i16>* %out) #0 {
; CHECK-LABEL: trunc_v16i32_v16i16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
; VBITS_GE_512: add [[A_HALFS]].h, [[A_HALFS]].h, [[A_HALFS]].h
; VBITS_GE_256-LABEL: trunc_v16i32_v16i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.h, vl8
; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: add z0.h, z1.h, z1.h
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: trunc_v16i32_v16i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ptrue p0.h, vl16
; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT: add z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_512-NEXT: ret
%a = load <16 x i32>, <16 x i32>* %in
%b = trunc <16 x i32> %a to <16 x i16>
%c = add <16 x i16> %b, %b
@@ -165,12 +226,16 @@ define void @trunc_v16i32_v16i16(<16 x i32>* %in, <16 x i16>* %out) #0 {
}
; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v32i32_v32i16(<32 x i32>* %in, <32 x i16>* %out) #0 {
define void @trunc_v32i32_v32i16(<32 x i32>* %in, <32 x i16>* %out) vscale_range(8,0) #0 {
; CHECK-LABEL: trunc_v32i32_v32i16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
; VBITS_GE_1024: add [[A_HALFS]].h, [[A_HALFS]].h, [[A_HALFS]].h
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ptrue p0.h, vl32
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: add z0.h, z0.h, z0.h
; CHECK-NEXT: st1h { z0.h }, p0, [x1]
; CHECK-NEXT: ret
%a = load <32 x i32>, <32 x i32>* %in
%b = trunc <32 x i32> %a to <32 x i16>
%c = add <32 x i16> %b, %b
@@ -179,12 +244,16 @@ define void @trunc_v32i32_v32i16(<32 x i32>* %in, <32 x i16>* %out) #0 {
}
; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v64i32_v64i16(<64 x i32>* %in, <64 x i16>* %out) #0 {
define void @trunc_v64i32_v64i16(<64 x i32>* %in, <64 x i16>* %out) vscale_range(16,0) #0 {
; CHECK-LABEL: trunc_v64i32_v64i16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
; VBITS_GE_2048: add [[A_HALFS]].h, [[A_HALFS]].h, [[A_HALFS]].h
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: add z0.h, z0.h, z0.h
; CHECK-NEXT: st1h { z0.h }, p0, [x1]
; CHECK-NEXT: ret
%a = load <64 x i32>, <64 x i32>* %in
%b = trunc <64 x i32> %a to <64 x i16>
%c = add <64 x i16> %b, %b
@@ -197,53 +266,78 @@ define void @trunc_v64i32_v64i16(<64 x i32>* %in, <64 x i16>* %out) #0 {
;
; NOTE: v4i8 is not legal so result i8 elements are held within i16 containers.
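; Consequently the checks below stop at the .h narrowing step and return the
; value in d0; no final uzp1 down to .b is expected.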
define <4 x i8> @trunc_v4i64_v4i8(<4 x i64>* %in) #0 {
define <4 x i8> @trunc_v4i64_v4i8(<4 x i64>* %in) vscale_range(2,0) #0 {
; CHECK-LABEL: trunc_v4i64_v4i8:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
; CHECK-NEXT: uzp1 z0.h, [[A_WORDS]].h, [[A_WORDS]].h
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%a = load <4 x i64>, <4 x i64>* %in
%b = trunc <4 x i64> %a to <4 x i8>
ret <4 x i8> %b
}
define <8 x i8> @trunc_v8i64_v8i8(<8 x i64>* %in) #0 {
; CHECK-LABEL: trunc_v8i64_v8i8:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
; VBITS_GE_512-NEXT: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
; VBITS_GE_512-NEXT: uzp1 z0.b, [[A_HALFS]].b, [[A_HALFS]].b
; VBITS_GE_512-NEXT: ret
; VBITS_GE_256-LABEL: trunc_v8i64_v8i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.s, vl4
; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s
; VBITS_GE_256-NEXT: uzp1 z0.h, z1.h, z1.h
; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: trunc_v8i64_v8i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT: uzp1 z0.b, z0.b, z0.b
; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_512-NEXT: ret
%a = load <8 x i64>, <8 x i64>* %in
%b = trunc <8 x i64> %a to <8 x i8>
ret <8 x i8> %b
}
define <16 x i8> @trunc_v16i64_v16i8(<16 x i64>* %in) #0 {
define <16 x i8> @trunc_v16i64_v16i8(<16 x i64>* %in) vscale_range(8,0) #0 {
; CHECK-LABEL: trunc_v16i64_v16i8:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
; VBITS_GE_1024-NEXT: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
; VBITS_GE_1024-NEXT: uzp1 z0.b, [[A_HALFS]].b, [[A_HALFS]].b
; VBITS_GE_1024-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%a = load <16 x i64>, <16 x i64>* %in
%b = trunc <16 x i64> %a to <16 x i8>
ret <16 x i8> %b
}
; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v32i64_v32i8(<32 x i64>* %in, <32 x i8>* %out) #0 {
define void @trunc_v32i64_v32i8(<32 x i64>* %in, <32 x i8>* %out) vscale_range(16,0) #0 {
; CHECK-LABEL: trunc_v32i64_v32i8:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
; VBITS_GE_2048: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
; VBITS_GE_2048: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b
; VBITS_GE_2048: add [[A_BYTES]].b, [[A_BYTES]].b, [[A_BYTES]].b
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ptrue p0.b, vl32
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT: add z0.b, z0.b, z0.b
; CHECK-NEXT: st1b { z0.b }, p0, [x1]
; CHECK-NEXT: ret
%a = load <32 x i64>, <32 x i64>* %in
%b = trunc <32 x i64> %a to <32 x i8>
%c = add <32 x i8> %b, %b
@@ -255,38 +349,60 @@ define void @trunc_v32i64_v32i8(<32 x i64>* %in, <32 x i8>* %out) #0 {
; truncate i64 -> i16
;
define <4 x i16> @trunc_v4i64_v4i16(<4 x i64>* %in) #0 {
define <4 x i16> @trunc_v4i64_v4i16(<4 x i64>* %in) vscale_range(2,0) #0 {
; CHECK-LABEL: trunc_v4i64_v4i16:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
; CHECK-NEXT: uzp1 z0.h, [[A_WORDS]].h, [[A_WORDS]].h
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%a = load <4 x i64>, <4 x i64>* %in
%b = trunc <4 x i64> %a to <4 x i16>
ret <4 x i16> %b
}
define <8 x i16> @trunc_v8i64_v8i16(<8 x i64>* %in) #0 {
; CHECK-LABEL: trunc_v8i64_v8i16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
; VBITS_GE_512-NEXT: uzp1 z0.h, [[A_WORDS]].h, [[A_WORDS]].h
; VBITS_GE_512-NEXT: ret
; VBITS_GE_256-LABEL: trunc_v8i64_v8i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT: uzp1 z2.h, z0.h, z0.h
; VBITS_GE_256-NEXT: uzp1 z0.h, z1.h, z1.h
; VBITS_GE_256-NEXT: mov v0.d[1], v2.d[0]
; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: trunc_v8i64_v8i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT: ret
%a = load <8 x i64>, <8 x i64>* %in
%b = trunc <8 x i64> %a to <8 x i16>
ret <8 x i16> %b
}
; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v16i64_v16i16(<16 x i64>* %in, <16 x i16>* %out) #0 {
define void @trunc_v16i64_v16i16(<16 x i64>* %in, <16 x i16>* %out) vscale_range(8,0) #0 {
; CHECK-LABEL: trunc_v16i64_v16i16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
; VBITS_GE_1024: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
; VBITS_GE_1024: add [[A_HALFS]].h, [[A_HALFS]].h, [[A_HALFS]].h
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: add z0.h, z0.h, z0.h
; CHECK-NEXT: st1h { z0.h }, p0, [x1]
; CHECK-NEXT: ret
%a = load <16 x i64>, <16 x i64>* %in
%b = trunc <16 x i64> %a to <16 x i16>
%c = add <16 x i16> %b, %b
@@ -295,13 +411,17 @@ define void @trunc_v16i64_v16i16(<16 x i64>* %in, <16 x i16>* %out) #0 {
}
; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v32i64_v32i16(<32 x i64>* %in, <32 x i16>* %out) #0 {
define void @trunc_v32i64_v32i16(<32 x i64>* %in, <32 x i16>* %out) vscale_range(16,0) #0 {
; CHECK-LABEL: trunc_v32i64_v32i16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
; VBITS_GE_2048: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
; VBITS_GE_2048: add [[A_HALFS]].h, [[A_HALFS]].h, [[A_HALFS]].h
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ptrue p0.h, vl32
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: add z0.h, z0.h, z0.h
; CHECK-NEXT: st1h { z0.h }, p0, [x1]
; CHECK-NEXT: ret
%a = load <32 x i64>, <32 x i64>* %in
%b = trunc <32 x i64> %a to <32 x i16>
%c = add <32 x i16> %b, %b
@@ -313,12 +433,14 @@ define void @trunc_v32i64_v32i16(<32 x i64>* %in, <32 x i16>* %out) #0 {
; truncate i64 -> i32
;
define <4 x i32> @trunc_v4i64_v4i32(<4 x i64>* %in) #0 {
define <4 x i32> @trunc_v4i64_v4i32(<4 x i64>* %in) vscale_range(2,0) #0 {
; CHECK-LABEL: trunc_v4i64_v4i32:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: uzp1 z0.s, [[A_DWORDS]].s, [[A_DWORDS]].s
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%a = load <4 x i64>, <4 x i64>* %in
%b = trunc <4 x i64> %a to <4 x i32>
ret <4 x i32> %b
@@ -326,11 +448,30 @@ define <4 x i32> @trunc_v4i64_v4i32(<4 x i64>* %in) #0 {
; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v8i64_v8i32(<8 x i64>* %in, <8 x i32>* %out) #0 {
; CHECK-LABEL: trunc_v8i64_v8i32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
; VBITS_GE_512: add [[A_WORDS]].s, [[A_WORDS]].s, [[A_WORDS]].s
; VBITS_GE_256-LABEL: trunc_v8i64_v8i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.s, vl4
; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: add z0.s, z1.s, z1.s
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: trunc_v8i64_v8i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ptrue p0.s, vl8
; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_512-NEXT: add z0.s, z0.s, z0.s
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT: ret
%a = load <8 x i64>, <8 x i64>* %in
%b = trunc <8 x i64> %a to <8 x i32>
%c = add <8 x i32> %b, %b
@@ -339,12 +480,16 @@ define void @trunc_v8i64_v8i32(<8 x i64>* %in, <8 x i32>* %out) #0 {
}
; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v16i64_v16i32(<16 x i64>* %in, <16 x i32>* %out) #0 {
define void @trunc_v16i64_v16i32(<16 x i64>* %in, <16 x i32>* %out) vscale_range(8,0) #0 {
; CHECK-LABEL: trunc_v16i64_v16i32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
; VBITS_GE_1024: add [[A_WORDS]].s, [[A_WORDS]].s, [[A_WORDS]].s
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ptrue p0.s, vl16
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT: add z0.s, z0.s, z0.s
; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret
%a = load <16 x i64>, <16 x i64>* %in
%b = trunc <16 x i64> %a to <16 x i32>
%c = add <16 x i32> %b, %b
@@ -353,12 +498,16 @@ define void @trunc_v16i64_v16i32(<16 x i64>* %in, <16 x i32>* %out) #0 {
}
; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v32i64_v32i32(<32 x i64>* %in, <32 x i32>* %out) #0 {
define void @trunc_v32i64_v32i32(<32 x i64>* %in, <32 x i32>* %out) vscale_range(16,0) #0 {
; CHECK-LABEL: trunc_v32i64_v32i32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
; VBITS_GE_2048: add [[A_WORDS]].s, [[A_WORDS]].s, [[A_WORDS]].s
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT: add z0.s, z0.s, z0.s
; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret
%a = load <32 x i64>, <32 x i64>* %in
%b = trunc <32 x i64> %a to <32 x i32>
%c = add <32 x i32> %b, %b

File diff suppressed because it is too large