[SVE][CodeGen] Restructure SVE fixed length tests to use update_llc_test_checks.

Most tests have been updated to make use of vscale_range to reduce
the number of RUN lines. For the remaining RUN lines the check
prefixes have been updated to ensure the expectations of the original
manually written CHECK lines are preserved after update_llc_test_checks.py
is run.
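To make the new pattern concrete, here is a minimal sketch of a restructured test (the contents below are illustrative only, not taken from the commit; the function name copy_v16i16 is hypothetical). The vscale_range(min,max) attribute, with max = 0 meaning unbounded, tells the backend the function only ever runs where vscale >= min, so one function attribute stands in for a whole sweep of -aarch64-sve-vector-bits-min RUN lines, and the CHECK bodies can then be regenerated mechanically with llvm/utils/update_llc_test_checks.py:

; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"

; vscale_range(2,0): vscale is at least 2, so SVE registers are at least
; 256 bits wide and a fixed-length <16 x i16> fits in a single register.
define void @copy_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: copy_v16i16:
; CHECK: ptrue p0.h, vl16
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: st1h { z0.h }, p0, [x1]
; CHECK-NEXT: ret
  %v = load volatile <16 x i16>, <16 x i16>* %a
  store volatile <16 x i16> %v, <16 x i16>* %b
  ret void
}

attributes #0 = { "target-features"="+sve" }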
Commit fcd058acc9 by Paul Walker, 2022-06-13 17:06:22 +01:00 (parent af6ec9200b).
47 changed files with 20179 additions and 27069 deletions.

[File diff suppressed because it is too large]

@@ -1,31 +1,17 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 target triple = "aarch64-unknown-linux-gnu"
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
 ; Don't use SVE for 64-bit vectors.
-define void @bitcast_v4i16(<4 x i16> *%a, <4 x half>* %b) #0 {
+define void @bitcast_v4i16(<4 x i16> *%a, <4 x half>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitcast_v4i16:
-; CHECK: ldr d0, [x0]
-; CHECK-NEXT: str d0, [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: str d0, [x1]
+; CHECK-NEXT: ret
 %load = load volatile <4 x i16>, <4 x i16>* %a
 %cast = bitcast <4 x i16> %load to <4 x half>
 store volatile <4 x half> %cast, <4 x half>* %b
@@ -33,23 +19,25 @@ define void @bitcast_v4i16(<4 x i16> *%a, <4 x half>* %b) #0 {
 }
 ; Don't use SVE for 128-bit vectors.
-define void @bitcast_v8i16(<8 x i16> *%a, <8 x half>* %b) #0 {
+define void @bitcast_v8i16(<8 x i16> *%a, <8 x half>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitcast_v8i16:
-; CHECK: ldr q0, [x0]
-; CHECK-NEXT: str q0, [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: str q0, [x1]
+; CHECK-NEXT: ret
 %load = load volatile <8 x i16>, <8 x i16>* %a
 %cast = bitcast <8 x i16> %load to <8 x half>
 store volatile <8 x half> %cast, <8 x half>* %b
 ret void
 }
-define void @bitcast_v16i16(<16 x i16> *%a, <16 x half>* %b) #0 {
+define void @bitcast_v16i16(<16 x i16> *%a, <16 x half>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitcast_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: st1h { [[OP]].h }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
 %load = load volatile <16 x i16>, <16 x i16>* %a
 %cast = bitcast <16 x i16> %load to <16 x half>
 store volatile <16 x half> %cast, <16 x half>* %b
@@ -57,35 +45,48 @@ define void @bitcast_v16i16(<16 x i16> *%a, <16 x half>* %b) #0 {
 }
 define void @bitcast_v32i16(<32 x i16> *%a, <32 x half>* %b) #0 {
-; CHECK-LABEL: bitcast_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: st1h { [[OP]].h }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: bitcast_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: bitcast_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
 %load = load volatile <32 x i16>, <32 x i16>* %a
 %cast = bitcast <32 x i16> %load to <32 x half>
 store volatile <32 x half> %cast, <32 x half>* %b
 ret void
 }
-define void @bitcast_v64i16(<64 x i16> *%a, <64 x half>* %b) #0 {
+define void @bitcast_v64i16(<64 x i16> *%a, <64 x half>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: bitcast_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: st1h { [[OP]].h }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
 %load = load volatile <64 x i16>, <64 x i16>* %a
 %cast = bitcast <64 x i16> %load to <64 x half>
 store volatile <64 x half> %cast, <64 x half>* %b
 ret void
 }
-define void @bitcast_v128i16(<128 x i16> *%a, <128 x half>* %b) #0 {
+define void @bitcast_v128i16(<128 x i16> *%a, <128 x half>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: bitcast_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: st1h { [[OP]].h }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
 %load = load volatile <128 x i16>, <128 x i16>* %a
 %cast = bitcast <128 x i16> %load to <128 x half>
 store volatile <128 x half> %cast, <128 x half>* %b
@@ -93,11 +94,12 @@ define void @bitcast_v128i16(<128 x i16> *%a, <128 x half>* %b) #0 {
 }
 ; Don't use SVE for 64-bit vectors.
-define void @bitcast_v2i32(<2 x i32> *%a, <2 x float>* %b) #0 {
+define void @bitcast_v2i32(<2 x i32> *%a, <2 x float>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitcast_v2i32:
-; CHECK: ldr d0, [x0]
-; CHECK-NEXT: str d0, [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: str d0, [x1]
+; CHECK-NEXT: ret
 %load = load volatile <2 x i32>, <2 x i32>* %a
 %cast = bitcast <2 x i32> %load to <2 x float>
 store volatile <2 x float> %cast, <2 x float>* %b
@@ -105,23 +107,25 @@ define void @bitcast_v2i32(<2 x i32> *%a, <2 x float>* %b) #0 {
 }
 ; Don't use SVE for 128-bit vectors.
-define void @bitcast_v4i32(<4 x i32> *%a, <4 x float>* %b) #0 {
+define void @bitcast_v4i32(<4 x i32> *%a, <4 x float>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitcast_v4i32:
-; CHECK: ldr q0, [x0]
-; CHECK-NEXT: str q0, [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: str q0, [x1]
+; CHECK-NEXT: ret
 %load = load volatile <4 x i32>, <4 x i32>* %a
 %cast = bitcast <4 x i32> %load to <4 x float>
 store volatile <4 x float> %cast, <4 x float>* %b
 ret void
 }
-define void @bitcast_v8i32(<8 x i32> *%a, <8 x float>* %b) #0 {
+define void @bitcast_v8i32(<8 x i32> *%a, <8 x float>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitcast_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: st1w { [[OP]].s }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
 %load = load volatile <8 x i32>, <8 x i32>* %a
 %cast = bitcast <8 x i32> %load to <8 x float>
 store volatile <8 x float> %cast, <8 x float>* %b
@@ -129,35 +133,48 @@ define void @bitcast_v8i32(<8 x i32> *%a, <8 x float>* %b) #0 {
 }
 define void @bitcast_v16i32(<16 x i32> *%a, <16 x float>* %b) #0 {
-; CHECK-LABEL: bitcast_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: st1w { [[OP]].s }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: bitcast_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: bitcast_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
 %load = load volatile <16 x i32>, <16 x i32>* %a
 %cast = bitcast <16 x i32> %load to <16 x float>
 store volatile <16 x float> %cast, <16 x float>* %b
 ret void
 }
-define void @bitcast_v32i32(<32 x i32> *%a, <32 x float>* %b) #0 {
+define void @bitcast_v32i32(<32 x i32> *%a, <32 x float>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: bitcast_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: st1w { [[OP]].s }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
 %load = load volatile <32 x i32>, <32 x i32>* %a
 %cast = bitcast <32 x i32> %load to <32 x float>
 store volatile <32 x float> %cast, <32 x float>* %b
 ret void
 }
-define void @bitcast_v64i32(<64 x i32> *%a, <64 x float>* %b) #0 {
+define void @bitcast_v64i32(<64 x i32> *%a, <64 x float>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: bitcast_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: st1w { [[OP]].s }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
 %load = load volatile <64 x i32>, <64 x i32>* %a
 %cast = bitcast <64 x i32> %load to <64 x float>
 store volatile <64 x float> %cast, <64 x float>* %b
@@ -165,11 +182,12 @@ define void @bitcast_v64i32(<64 x i32> *%a, <64 x float>* %b) #0 {
 }
 ; Don't use SVE for 64-bit vectors.
-define void @bitcast_v1i64(<1 x i64> *%a, <1 x double>* %b) #0 {
+define void @bitcast_v1i64(<1 x i64> *%a, <1 x double>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitcast_v1i64:
-; CHECK: ldr d0, [x0]
-; CHECK-NEXT: str d0, [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: str d0, [x1]
+; CHECK-NEXT: ret
 %load = load volatile <1 x i64>, <1 x i64>* %a
 %cast = bitcast <1 x i64> %load to <1 x double>
 store volatile <1 x double> %cast, <1 x double>* %b
@@ -177,23 +195,25 @@ define void @bitcast_v1i64(<1 x i64> *%a, <1 x double>* %b) #0 {
 }
 ; Don't use SVE for 128-bit vectors.
-define void @bitcast_v2i64(<2 x i64> *%a, <2 x double>* %b) #0 {
+define void @bitcast_v2i64(<2 x i64> *%a, <2 x double>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitcast_v2i64:
-; CHECK: ldr q0, [x0]
-; CHECK-NEXT: str q0, [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: str q0, [x1]
+; CHECK-NEXT: ret
 %load = load volatile <2 x i64>, <2 x i64>* %a
 %cast = bitcast <2 x i64> %load to <2 x double>
 store volatile <2 x double> %cast, <2 x double>* %b
 ret void
 }
-define void @bitcast_v4i64(<4 x i64> *%a, <4 x double>* %b) #0 {
+define void @bitcast_v4i64(<4 x i64> *%a, <4 x double>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitcast_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: st1d { [[OP]].d }, [[PG]], [x1]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
 %load = load volatile <4 x i64>, <4 x i64>* %a
 %cast = bitcast <4 x i64> %load to <4 x double>
 store volatile <4 x double> %cast, <4 x double>* %b
@@ -201,35 +221,48 @@ define void @bitcast_v4i64(<4 x i64> *%a, <4 x double>* %b) #0 {
 }
 define void @bitcast_v8i64(<8 x i64> *%a, <8 x double>* %b) #0 {
-; CHECK-LABEL: bitcast_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: st1d { [[OP]].d }, [[PG]], [x1]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: bitcast_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: bitcast_v8i64:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
 %load = load volatile <8 x i64>, <8 x i64>* %a
 %cast = bitcast <8 x i64> %load to <8 x double>
 store volatile <8 x double> %cast, <8 x double>* %b
 ret void
 }
-define void @bitcast_v16i64(<16 x i64> *%a, <16 x double>* %b) #0 {
+define void @bitcast_v16i64(<16 x i64> *%a, <16 x double>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: bitcast_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: st1d { [[OP]].d }, [[PG]], [x1]
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
 %load = load volatile <16 x i64>, <16 x i64>* %a
 %cast = bitcast <16 x i64> %load to <16 x double>
 store volatile <16 x double> %cast, <16 x double>* %b
 ret void
 }
-define void @bitcast_v32i64(<32 x i64> *%a, <32 x double>* %b) #0 {
+define void @bitcast_v32i64(<32 x i64> *%a, <32 x double>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: bitcast_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: st1d { [[OP]].d }, [[PG]], [x1]
-; VBITS_GE_2048-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
 %load = load volatile <32 x i64>, <32 x i64>* %a
 %cast = bitcast <32 x i64> %load to <32 x double>
 store volatile <32 x double> %cast, <32 x double>* %b

[File diff suppressed because it is too large]

@@ -1,25 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; Don't use SVE when its registers are no bigger than NEON.
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue
-; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_2048
 target triple = "aarch64-unknown-linux-gnu"
-define <4 x i32> @load_zext_v4i16i32(<4 x i16>* %ap) #0 {
+define <4 x i32> @load_zext_v4i16i32(<4 x i16>* %ap) vscale_range(2,0) #0 {
 ; CHECK-LABEL: load_zext_v4i16i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
@@ -49,7 +34,7 @@ define <2 x i256> @load_zext_v2i64i256(<2 x i64>* %ap) #0 {
 ret <2 x i256> %val
 }
-define <8 x i32> @load_zext_v8i16i32(<8 x i16>* %ap) #0 {
+define <8 x i32> @load_zext_v8i16i32(<8 x i16>* %ap) vscale_range(2,0) #0 {
 ; CHECK-LABEL: load_zext_v8i16i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.s, vl8
@@ -61,103 +46,43 @@ define <8 x i32> @load_zext_v8i16i32(<8 x i16>* %ap) #0 {
 ret <8 x i32> %val
 }
-define <16 x i32> @load_zext_v16i16i32(<16 x i16>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_zext_v16i16i32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_512-LABEL: load_zext_v16i16i32:
-; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ptrue p0.s, vl16
-; VBITS_GE_512-NEXT: ld1h { z0.s }, p0/z, [x0]
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
-; VBITS_GE_512-NEXT: ret
-; Ensure sensible type legalistaion
+define <16 x i32> @load_zext_v16i16i32(<16 x i16>* %ap) vscale_range(4,0) #0 {
+; CHECK-LABEL: load_zext_v16i16i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl16
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: ret
 %a = load <16 x i16>, <16 x i16>* %ap
 %val = zext <16 x i16> %a to <16 x i32>
 ret <16 x i32> %val
 }
-define <32 x i32> @load_zext_v32i16i32(<32 x i16>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_zext_v32i16i32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x9, #16
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x10, #24
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: uunpklo z2.s, z0.h
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: uunpklo z2.s, z1.h
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x8]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: load_zext_v32i16i32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: ld1h { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8]
-; VBITS_GE_1024-NEXT: ret
+define <32 x i32> @load_zext_v32i16i32(<32 x i16>* %ap) vscale_range(8,0) #0 {
+; CHECK-LABEL: load_zext_v32i16i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: ret
 %a = load <32 x i16>, <32 x i16>* %ap
 %val = zext <32 x i16> %a to <32 x i32>
 ret <32 x i32> %val
 }
 define <64 x i32> @load_zext_v64i16i32(<64 x i16>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_zext_v64i16i32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x9, #16
-; VBITS_GE_256-NEXT: mov x10, #32
-; VBITS_GE_256-NEXT: mov x11, #48
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x12, #24
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x11, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: uunpklo z4.s, z0.h
-; VBITS_GE_256-NEXT: uunpklo z5.s, z1.h
-; VBITS_GE_256-NEXT: uunpklo z6.s, z2.h
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x8, x11, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x8, x10, lsl #2]
-; VBITS_GE_256-NEXT: mov x10, #56
-; VBITS_GE_256-NEXT: mov x11, #40
-; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
-; VBITS_GE_256-NEXT: uunpklo z7.s, z3.h
-; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT: uunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x8, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x11, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x12, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x8]
-; VBITS_GE_256-NEXT: ret
+; VBITS_GE_1024-LABEL: load_zext_v64i16i32:
+; VBITS_GE_1024: // %bb.0:
+; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
+; VBITS_GE_1024-NEXT: mov x9, #32
+; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
+; VBITS_GE_1024-NEXT: uunpklo z1.s, z0.h
+; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
+; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_1024-NEXT: st1w { z1.s }, p0, [x8]
+; VBITS_GE_1024-NEXT: ret
 ;
 ; VBITS_GE_2048-LABEL: load_zext_v64i16i32:
 ; VBITS_GE_2048: // %bb.0:
@@ -170,7 +95,7 @@ define <64 x i32> @load_zext_v64i16i32(<64 x i16>* %ap) #0 {
 ret <64 x i32> %val
 }
-define <4 x i32> @load_sext_v4i16i32(<4 x i16>* %ap) #0 {
+define <4 x i32> @load_sext_v4i16i32(<4 x i16>* %ap) vscale_range(2,0) #0 {
 ; CHECK-LABEL: load_sext_v4i16i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
@@ -181,7 +106,7 @@ define <4 x i32> @load_sext_v4i16i32(<4 x i16>* %ap) #0 {
 ret <4 x i32> %val
 }
-define <8 x i32> @load_sext_v8i16i32(<8 x i16>* %ap) #0 {
+define <8 x i32> @load_sext_v8i16i32(<8 x i16>* %ap) vscale_range(2,0) #0 {
 ; CHECK-LABEL: load_sext_v8i16i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.s, vl8
@@ -193,103 +118,43 @@ define <8 x i32> @load_sext_v8i16i32(<8 x i16>* %ap) #0 {
 ret <8 x i32> %val
 }
-define <16 x i32> @load_sext_v16i16i32(<16 x i16>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_sext_v16i16i32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: sunpklo z1.s, z0.h
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_512-LABEL: load_sext_v16i16i32:
-; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ptrue p0.s, vl16
-; VBITS_GE_512-NEXT: ld1sh { z0.s }, p0/z, [x0]
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
-; VBITS_GE_512-NEXT: ret
-; Ensure sensible type legalistaion
+define <16 x i32> @load_sext_v16i16i32(<16 x i16>* %ap) vscale_range(4,0) #0 {
+; CHECK-LABEL: load_sext_v16i16i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl16
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: ret
 %a = load <16 x i16>, <16 x i16>* %ap
 %val = sext <16 x i16> %a to <16 x i32>
 ret <16 x i32> %val
 }
-define <32 x i32> @load_sext_v32i16i32(<32 x i16>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_sext_v32i16i32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x9, #16
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x10, #24
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: sunpklo z2.s, z0.h
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x8]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: load_sext_v32i16i32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: ld1sh { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8]
-; VBITS_GE_1024-NEXT: ret
+define <32 x i32> @load_sext_v32i16i32(<32 x i16>* %ap) vscale_range(8,0) #0 {
+; CHECK-LABEL: load_sext_v32i16i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: ret
 %a = load <32 x i16>, <32 x i16>* %ap
 %val = sext <32 x i16> %a to <32 x i32>
 ret <32 x i32> %val
 }
 define <64 x i32> @load_sext_v64i16i32(<64 x i16>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_sext_v64i16i32:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x9, #16
-; VBITS_GE_256-NEXT: mov x10, #32
-; VBITS_GE_256-NEXT: mov x11, #48
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x12, #24
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x10, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x11, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: sunpklo z4.s, z0.h
-; VBITS_GE_256-NEXT: sunpklo z5.s, z1.h
-; VBITS_GE_256-NEXT: sunpklo z6.s, z2.h
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x8, x11, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x8, x10, lsl #2]
-; VBITS_GE_256-NEXT: mov x10, #56
-; VBITS_GE_256-NEXT: mov x11, #40
-; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
-; VBITS_GE_256-NEXT: sunpklo z7.s, z3.h
-; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h
-; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x8, x10, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x11, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x12, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x8]
-; VBITS_GE_256-NEXT: ret
+; VBITS_GE_1024-LABEL: load_sext_v64i16i32:
+; VBITS_GE_1024: // %bb.0:
+; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
+; VBITS_GE_1024-NEXT: mov x9, #32
+; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
+; VBITS_GE_1024-NEXT: sunpklo z1.s, z0.h
+; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
+; VBITS_GE_1024-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_1024-NEXT: st1w { z1.s }, p0, [x8]
+; VBITS_GE_1024-NEXT: ret
 ;
 ; VBITS_GE_2048-LABEL: load_sext_v64i16i32:
 ; VBITS_GE_2048: // %bb.0:
@@ -303,52 +168,22 @@ define <64 x i32> @load_sext_v64i16i32(<64 x i16>* %ap) #0 {
 }
 define <32 x i64> @load_zext_v32i8i64(<32 x i8>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_zext_v32i8i64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: ptrue p0.b, vl32
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: mov x10, #24
-; VBITS_GE_256-NEXT: ushll2 v2.8h, v0.16b, #0
-; VBITS_GE_256-NEXT: ushll v1.8h, v0.8b, #0
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: ext v3.16b, v2.16b, v2.16b, #8
-; VBITS_GE_256-NEXT: uunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT: ushll2 v4.8h, v0.16b, #0
-; VBITS_GE_256-NEXT: uunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #12
-; VBITS_GE_256-NEXT: uunpklo z2.s, z4.h
-; VBITS_GE_256-NEXT: ext v4.16b, v4.16b, v4.16b, #8
-; VBITS_GE_256-NEXT: uunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT: mov x10, #28
-; VBITS_GE_256-NEXT: uunpklo z2.d, z3.s
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #16
-; VBITS_GE_256-NEXT: ext v2.16b, v1.16b, v1.16b, #8
-; VBITS_GE_256-NEXT: uunpklo z3.s, z4.h
-; VBITS_GE_256-NEXT: ushll v0.8h, v0.8b, #0
-; VBITS_GE_256-NEXT: uunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT: mov x10, #4
-; VBITS_GE_256-NEXT: ext v3.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: uunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: uunpklo z0.d, z2.s
-; VBITS_GE_256-NEXT: mov x9, #20
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT: uunpklo z0.s, z1.h
-; VBITS_GE_256-NEXT: uunpklo z1.s, z3.h
-; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8]
-; VBITS_GE_256-NEXT: ret
+; VBITS_GE_1024-LABEL: load_zext_v32i8i64:
+; VBITS_GE_1024: // %bb.0:
+; VBITS_GE_1024-NEXT: ptrue p0.b, vl32
+; VBITS_GE_1024-NEXT: mov x9, #16
+; VBITS_GE_1024-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
+; VBITS_GE_1024-NEXT: uunpklo z1.h, z0.b
+; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_1024-NEXT: uunpklo z0.h, z0.b
+; VBITS_GE_1024-NEXT: uunpklo z1.s, z1.h
+; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_1024-NEXT: uunpklo z1.d, z1.s
+; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_1024-NEXT: ret
 ;
 ; VBITS_GE_2048-LABEL: load_zext_v32i8i64:
 ; VBITS_GE_2048: // %bb.0:
@@ -362,52 +197,22 @@ define <32 x i64> @load_zext_v32i8i64(<32 x i8>* %ap) #0 {
 }
 define <32 x i64> @load_sext_v32i8i64(<32 x i8>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_sext_v32i8i64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: ptrue p0.b, vl32
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: mov x10, #24
-; VBITS_GE_256-NEXT: sshll2 v2.8h, v0.16b, #0
-; VBITS_GE_256-NEXT: sshll v1.8h, v0.8b, #0
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: ext v3.16b, v2.16b, v2.16b, #8
-; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT: sshll2 v4.8h, v0.16b, #0
-; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #12
-; VBITS_GE_256-NEXT: sunpklo z2.s, z4.h
-; VBITS_GE_256-NEXT: ext v4.16b, v4.16b, v4.16b, #8
-; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT: mov x10, #28
-; VBITS_GE_256-NEXT: sunpklo z2.d, z3.s
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #16
-; VBITS_GE_256-NEXT: ext v2.16b, v1.16b, v1.16b, #8
-; VBITS_GE_256-NEXT: sunpklo z3.s, z4.h
-; VBITS_GE_256-NEXT: sshll v0.8h, v0.8b, #0
-; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT: mov x10, #4
-; VBITS_GE_256-NEXT: ext v3.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: sunpklo z0.d, z2.s
-; VBITS_GE_256-NEXT: mov x9, #20
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT: sunpklo z0.s, z1.h
-; VBITS_GE_256-NEXT: sunpklo z1.s, z3.h
-; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8]
-; VBITS_GE_256-NEXT: ret
+; VBITS_GE_1024-LABEL: load_sext_v32i8i64:
+; VBITS_GE_1024: // %bb.0:
+; VBITS_GE_1024-NEXT: ptrue p0.b, vl32
+; VBITS_GE_1024-NEXT: mov x9, #16
+; VBITS_GE_1024-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
+; VBITS_GE_1024-NEXT: sunpklo z1.h, z0.b
+; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_1024-NEXT: sunpklo z0.h, z0.b
+; VBITS_GE_1024-NEXT: sunpklo z1.s, z1.h
+; VBITS_GE_1024-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_1024-NEXT: sunpklo z1.d, z1.s
+; VBITS_GE_1024-NEXT: sunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_1024-NEXT: ret
 ;
 ; VBITS_GE_2048-LABEL: load_sext_v32i8i64:
 ; VBITS_GE_2048: // %bb.0:
@@ -421,50 +226,20 @@ define <32 x i64> @load_sext_v32i8i64(<32 x i8>* %ap) #0 {
 }
 define <32 x i64> @load_zext_v32i16i64(<32 x i16>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_zext_v32i16i64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x9, #16
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x10, #24
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ext v2.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h
-; VBITS_GE_256-NEXT: uunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: ext v5.16b, v1.16b, v1.16b, #8
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #20
-; VBITS_GE_256-NEXT: uunpklo z4.s, z1.h
-; VBITS_GE_256-NEXT: ext v3.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT: uunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: uunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: ext v6.16b, v1.16b, v1.16b, #8
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: mov x10, #4
-; VBITS_GE_256-NEXT: uunpklo z5.s, z5.h
-; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT: uunpklo z5.d, z5.s
-; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: uunpklo z0.s, z3.h
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #28
-; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT: mov x10, #12
-; VBITS_GE_256-NEXT: uunpklo z2.s, z6.h
-; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: uunpklo z1.d, z4.s
-; VBITS_GE_256-NEXT: uunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
-; VBITS_GE_256-NEXT: ret
+; VBITS_GE_1024-LABEL: load_zext_v32i16i64:
+; VBITS_GE_1024: // %bb.0:
+; VBITS_GE_1024-NEXT: ptrue p0.h, vl32
+; VBITS_GE_1024-NEXT: mov x9, #16
+; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
+; VBITS_GE_1024-NEXT: uunpklo z1.s, z0.h
+; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #32
+; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_1024-NEXT: uunpklo z1.d, z1.s
+; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_1024-NEXT: ret
 ;
 ; VBITS_GE_2048-LABEL: load_zext_v32i16i64:
 ; VBITS_GE_2048: // %bb.0:
@@ -478,50 +253,20 @@ define <32 x i64> @load_zext_v32i16i64(<32 x i16>* %ap) #0 {
 }
 define <32 x i64> @load_sext_v32i16i64(<32 x i16>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_sext_v32i16i64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x9, #16
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x10, #24
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: ext v2.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT: sunpklo z3.s, z0.h
-; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: ext v5.16b, v1.16b, v1.16b, #8
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #20
-; VBITS_GE_256-NEXT: sunpklo z4.s, z1.h
-; VBITS_GE_256-NEXT: ext v3.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h
-; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: ext v6.16b, v1.16b, v1.16b, #8
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: mov x10, #4
-; VBITS_GE_256-NEXT: sunpklo z5.s, z5.h
-; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT: sunpklo z5.d, z5.s
-; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: sunpklo z0.s, z3.h
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #28
-; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT: mov x10, #12
-; VBITS_GE_256-NEXT: sunpklo z2.s, z6.h
-; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: sunpklo z1.d, z4.s
-; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
-; VBITS_GE_256-NEXT: ret
+; VBITS_GE_1024-LABEL: load_sext_v32i16i64:
+; VBITS_GE_1024: // %bb.0:
+; VBITS_GE_1024-NEXT: ptrue p0.h, vl32
+; VBITS_GE_1024-NEXT: mov x9, #16
+; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
+; VBITS_GE_1024-NEXT: sunpklo z1.s, z0.h
+; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #32
+; VBITS_GE_1024-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_1024-NEXT: sunpklo z1.d, z1.s
+; VBITS_GE_1024-NEXT: sunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_1024-NEXT: ret
 ;
 ; VBITS_GE_2048-LABEL: load_sext_v32i16i64:
 ; VBITS_GE_2048: // %bb.0:
@@ -535,42 +280,18 @@ define <32 x i64> @load_sext_v32i16i64(<32 x i16>* %ap) #0 {
 }
 define <32 x i64> @load_zext_v32i32i64(<32 x i32>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_zext_v32i32i64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: mov x11, #24
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x12, #12
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: uunpklo z4.d, z0.s
-; VBITS_GE_256-NEXT: uunpklo z5.d, z1.s
-; VBITS_GE_256-NEXT: uunpklo z6.d, z2.s
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x8, x11, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT: mov x10, #28
-; VBITS_GE_256-NEXT: mov x11, #20
-; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #4
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
-; VBITS_GE_256-NEXT: uunpklo z7.d, z3.s
-; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: uunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT: uunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x11, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x12, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x8]
-; VBITS_GE_256-NEXT: ret
+; VBITS_GE_1024-LABEL: load_zext_v32i32i64:
+; VBITS_GE_1024: // %bb.0:
+; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
+; VBITS_GE_1024-NEXT: mov x9, #16
+; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
+; VBITS_GE_1024-NEXT: uunpklo z1.d, z0.s
+; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
+; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_1024-NEXT: ret
 ;
 ; VBITS_GE_2048-LABEL: load_zext_v32i32i64:
 ; VBITS_GE_2048: // %bb.0:
@@ -584,42 +305,18 @@ define <32 x i64> @load_zext_v32i32i64(<32 x i32>* %ap) #0 {
 }
 define <32 x i64> @load_sext_v32i32i64(<32 x i32>* %ap) #0 {
-; VBITS_GE_256-LABEL: load_sext_v32i32i64:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x9, #8
-; VBITS_GE_256-NEXT: mov x10, #16
-; VBITS_GE_256-NEXT: mov x11, #24
-; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x12, #12
-; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: sunpklo z4.d, z0.s
-; VBITS_GE_256-NEXT: sunpklo z5.d, z1.s
-; VBITS_GE_256-NEXT: sunpklo z6.d, z2.s
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x8, x11, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT: mov x10, #28
-; VBITS_GE_256-NEXT: mov x11, #20
-; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x9, #4
-; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
-; VBITS_GE_256-NEXT: sunpklo z7.d, z3.s
-; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x10, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x11, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x12, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x8]
-; VBITS_GE_256-NEXT: ret
+; VBITS_GE_1024-LABEL: load_sext_v32i32i64:
+; VBITS_GE_1024: // %bb.0:
+; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
+; VBITS_GE_1024-NEXT: mov x9, #16
+; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
+; VBITS_GE_1024-NEXT: sunpklo z1.d, z0.s
+; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
+; VBITS_GE_1024-NEXT: sunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8]
+; VBITS_GE_1024-NEXT: ret
 ;
 ; VBITS_GE_2048-LABEL: load_sext_v32i32i64:
 ; VBITS_GE_2048: // %bb.0:


@@ -1,28 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; Don't use SVE when its registers are no bigger than NEON.
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue
 ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 target triple = "aarch64-unknown-linux-gnu"
 ; i8
 ; Don't use SVE for 64-bit vectors.
-define <4 x i8> @extract_subvector_v8i8(<8 x i8> %op) #0 {
+define <4 x i8> @extract_subvector_v8i8(<8 x i8> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extract_subvector_v8i8:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: zip2 v0.8b, v0.8b, v0.8b
@@ -32,7 +18,7 @@ define <4 x i8> @extract_subvector_v8i8(<8 x i8> %op) #0 {
 }
 ; Don't use SVE for 128-bit vectors.
-define <8 x i8> @extract_subvector_v16i8(<16 x i8> %op) #0 {
+define <8 x i8> @extract_subvector_v16i8(<16 x i8> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extract_subvector_v16i8:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
@@ -42,7 +28,7 @@ define <8 x i8> @extract_subvector_v16i8(<16 x i8> %op) #0 {
 ret <8 x i8> %ret
 }
-define void @extract_subvector_v32i8(<32 x i8>* %a, <16 x i8>* %b) #0 {
+define void @extract_subvector_v32i8(<32 x i8>* %a, <16 x i8>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extract_subvector_v32i8:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.b, vl32
@@ -79,62 +65,30 @@ define void @extract_subvector_v64i8(<64 x i8>* %a, <32 x i8>* %b) #0 {
 ret void
 }
-define void @extract_subvector_v128i8(<128 x i8>* %a, <64 x i8>* %b) #0 {
-; VBITS_GE_256-LABEL: extract_subvector_v128i8:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov w8, #64
-; VBITS_GE_256-NEXT: mov w9, #96
-; VBITS_GE_256-NEXT: ptrue p0.b, vl32
-; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
-; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x9]
-; VBITS_GE_256-NEXT: mov w8, #32
-; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x1, x8]
-; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: extract_subvector_v128i8:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.b, vl128
-; VBITS_GE_1024-NEXT: ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue p0.b, vl64
-; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
-; VBITS_GE_1024-NEXT: st1b { z0.b }, p0, [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @extract_subvector_v128i8(<128 x i8>* %a, <64 x i8>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: extract_subvector_v128i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.b, vl64
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #64
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
 %op = load <128 x i8>, <128 x i8>* %a
 %ret = call <64 x i8> @llvm.experimental.vector.extract.v64i8.v128i8(<128 x i8> %op, i64 64)
 store <64 x i8> %ret, <64 x i8>* %b
 ret void
 }
-define void @extract_subvector_v256i8(<256 x i8>* %a, <128 x i8>* %b) #0 {
-; VBITS_GE_256-LABEL: extract_subvector_v256i8:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov w8, #128
-; VBITS_GE_256-NEXT: mov w9, #160
-; VBITS_GE_256-NEXT: mov w10, #224
-; VBITS_GE_256-NEXT: mov w11, #192
-; VBITS_GE_256-NEXT: ptrue p0.b, vl32
-; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
-; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x9]
-; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0, x10]
-; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x0, x11]
-; VBITS_GE_256-NEXT: mov w8, #64
-; VBITS_GE_256-NEXT: mov w9, #96
-; VBITS_GE_256-NEXT: mov w10, #32
-; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1]
-; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x1, x8]
-; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x1, x9]
-; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x1, x10]
-; VBITS_GE_256-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: extract_subvector_v256i8:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.b, vl256
-; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue p0.b, vl128
-; VBITS_GE_2048-NEXT: ext z0.b, z0.b, z0.b, #128
-; VBITS_GE_2048-NEXT: st1b { z0.b }, p0, [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @extract_subvector_v256i8(<256 x i8>* %a, <128 x i8>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: extract_subvector_v256i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
 %op = load <256 x i8>, <256 x i8>* %a
 %ret = call <128 x i8> @llvm.experimental.vector.extract.v128i8.v256i8(<256 x i8> %op, i64 128)
 store <128 x i8> %ret, <128 x i8>* %b
@@ -144,7 +98,7 @@ define void @extract_subvector_v256i8(<256 x i8>* %a, <128 x i8>* %b) #0 {
 ; i16
 ; Don't use SVE for 64-bit vectors.
-define <2 x i16> @extract_subvector_v4i16(<4 x i16> %op) #0 {
+define <2 x i16> @extract_subvector_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extract_subvector_v4i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
@@ -159,7 +113,7 @@ define <2 x i16> @extract_subvector_v4i16(<4 x i16> %op) #0 {
 }
 ; Don't use SVE for 128-bit vectors.
-define <4 x i16> @extract_subvector_v8i16(<8 x i16> %op) #0 {
+define <4 x i16> @extract_subvector_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extract_subvector_v8i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
@@ -169,7 +123,7 @@ define <4 x i16> @extract_subvector_v8i16(<8 x i16> %op) #0 {
 ret <4 x i16> %ret
 }
-define void @extract_subvector_v16i16(<16 x i16>* %a, <8 x i16>* %b) #0 {
+define void @extract_subvector_v16i16(<16 x i16>* %a, <8 x i16>* %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extract_subvector_v16i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.h, vl16
@@ -206,62 +160,30 @@ define void @extract_subvector_v32i16(<32 x i16>* %a, <16 x i16>* %b) #0 {
 ret void
 }
-define void @extract_subvector_v64i16(<64 x i16>* %a, <32 x i16>* %b) #0 {
-; VBITS_GE_256-LABEL: extract_subvector_v64i16:
-; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #32
-; VBITS_GE_256-NEXT: mov x9, #48
-; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT: mov x8, #16
-; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1]
-; VBITS_GE_256-NEXT: ret
+define void @extract_subvector_v64i16(<64 x i16>* %a, <32 x i16>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: extract_subvector_v64i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #64
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
;
; VBITS_GE_1024-LABEL: extract_subvector_v64i16:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.h, vl32
; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
%op = load <64 x i16>, <64 x i16>* %a %op = load <64 x i16>, <64 x i16>* %a
%ret = call <32 x i16> @llvm.experimental.vector.extract.v32i16.v64i16(<64 x i16> %op, i64 32) %ret = call <32 x i16> @llvm.experimental.vector.extract.v32i16.v64i16(<64 x i16> %op, i64 32)
store <32 x i16> %ret, <32 x i16>* %b store <32 x i16> %ret, <32 x i16>* %b
ret void ret void
} }
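; The immediate on ext above is a byte count, which is why the same #64 shows
; up for <128 x i8> (64 elements x 1 byte) and <64 x i16> (32 elements x 2
; bytes). Sketch of the recipe (worked arithmetic, illustrative only):
;   byte offset = number of extracted elements x element size in bytes
;   e.g. <32 x i32> -> <16 x i32>: 16 x 4 = #64; <256 x i8> -> <128 x i8>: #128
; with the halved-width store predicated by the second, narrower ptrue.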
define void @extract_subvector_v128i16(<128 x i16>* %a, <64 x i16>* %b) #0 { define void @extract_subvector_v128i16(<128 x i16>* %a, <64 x i16>* %b) vscale_range(16,0) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v128i16: ; CHECK-LABEL: extract_subvector_v128i16:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #64 ; CHECK-NEXT: ptrue p0.h, vl128
; VBITS_GE_256-NEXT: mov x9, #80 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: mov x10, #112 ; CHECK-NEXT: ptrue p0.h, vl64
; VBITS_GE_256-NEXT: mov x11, #96 ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; CHECK-NEXT: ret
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1]
; VBITS_GE_256-NEXT: mov x8, #32
; VBITS_GE_256-NEXT: mov x9, #48
; VBITS_GE_256-NEXT: mov x10, #16
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x9, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1, x10, lsl #1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: extract_subvector_v128i16:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.h, vl64
; VBITS_GE_2048-NEXT: ext z0.b, z0.b, z0.b, #128
; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
%op = load <128 x i16>, <128 x i16>* %a %op = load <128 x i16>, <128 x i16>* %a
%ret = call <64 x i16> @llvm.experimental.vector.extract.v64i16.v128i16(<128 x i16> %op, i64 64) %ret = call <64 x i16> @llvm.experimental.vector.extract.v64i16.v128i16(<128 x i16> %op, i64 64)
store <64 x i16> %ret, <64 x i16>* %b store <64 x i16> %ret, <64 x i16>* %b
@ -271,7 +193,7 @@ define void @extract_subvector_v128i16(<128 x i16>* %a, <64 x i16>* %b) #0 {
; i32 ; i32
; Don't use SVE for 64-bit vectors. ; Don't use SVE for 64-bit vectors.
define <1 x i32> @extract_subvector_v2i32(<2 x i32> %op) #0 { define <1 x i32> @extract_subvector_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v2i32: ; CHECK-LABEL: extract_subvector_v2i32:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
@ -282,7 +204,7 @@ define <1 x i32> @extract_subvector_v2i32(<2 x i32> %op) #0 {
} }
; Don't use SVE for 128-bit vectors. ; Don't use SVE for 128-bit vectors.
define <2 x i32> @extract_subvector_v4i32(<4 x i32> %op) #0 { define <2 x i32> @extract_subvector_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v4i32: ; CHECK-LABEL: extract_subvector_v4i32:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
@ -292,7 +214,7 @@ define <2 x i32> @extract_subvector_v4i32(<4 x i32> %op) #0 {
ret <2 x i32> %ret ret <2 x i32> %ret
} }
define void @extract_subvector_v8i32(<8 x i32>* %a, <4 x i32>* %b) #0 { define void @extract_subvector_v8i32(<8 x i32>* %a, <4 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v8i32: ; CHECK-LABEL: extract_subvector_v8i32:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ptrue p0.s, vl8
@ -329,62 +251,30 @@ define void @extract_subvector_v16i32(<16 x i32>* %a, <8 x i32>* %b) #0 {
ret void ret void
} }
define void @extract_subvector_v32i32(<32 x i32>* %a, <16 x i32>* %b) #0 { define void @extract_subvector_v32i32(<32 x i32>* %a, <16 x i32>* %b) vscale_range(8,0) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v32i32: ; CHECK-LABEL: extract_subvector_v32i32:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16 ; CHECK-NEXT: ptrue p0.s, vl32
; VBITS_GE_256-NEXT: mov x9, #24 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ptrue p0.s, vl16
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #64
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_256-NEXT: mov x8, #8 ; CHECK-NEXT: ret
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: extract_subvector_v32i32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
%op = load <32 x i32>, <32 x i32>* %a %op = load <32 x i32>, <32 x i32>* %a
%ret = call <16 x i32> @llvm.experimental.vector.extract.v16i32.v32i32(<32 x i32> %op, i64 16) %ret = call <16 x i32> @llvm.experimental.vector.extract.v16i32.v32i32(<32 x i32> %op, i64 16)
store <16 x i32> %ret, <16 x i32>* %b store <16 x i32> %ret, <16 x i32>* %b
ret void ret void
} }
define void @extract_subvector_v64i32(<64 x i32>* %a, <32 x i32>* %b) #0 { define void @extract_subvector_v64i32(<64 x i32>* %a, <32 x i32>* %b) vscale_range(16,0) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v64i32: ; CHECK-LABEL: extract_subvector_v64i32:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #32 ; CHECK-NEXT: ptrue p0.s, vl64
; VBITS_GE_256-NEXT: mov x9, #40 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: mov x10, #56 ; CHECK-NEXT: ptrue p0.s, vl32
; VBITS_GE_256-NEXT: mov x11, #48 ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; CHECK-NEXT: ret
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2]
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: mov x9, #24
; VBITS_GE_256-NEXT: mov x10, #8
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x10, lsl #2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: extract_subvector_v64i32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
; VBITS_GE_2048-NEXT: ext z0.b, z0.b, z0.b, #128
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
%op = load <64 x i32>, <64 x i32>* %a %op = load <64 x i32>, <64 x i32>* %a
%ret = call <32 x i32> @llvm.experimental.vector.extract.v32i32.v64i32(<64 x i32> %op, i64 32) %ret = call <32 x i32> @llvm.experimental.vector.extract.v32i32.v64i32(<64 x i32> %op, i64 32)
store <32 x i32> %ret, <32 x i32>* %b store <32 x i32> %ret, <32 x i32>* %b
@ -394,7 +284,7 @@ define void @extract_subvector_v64i32(<64 x i32>* %a, <32 x i32>* %b) #0 {
; i64 ; i64
; Don't use SVE for 128-bit vectors. ; Don't use SVE for 128-bit vectors.
define <1 x i64> @extract_subvector_v2i64(<2 x i64> %op) #0 { define <1 x i64> @extract_subvector_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v2i64: ; CHECK-LABEL: extract_subvector_v2i64:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
@ -404,7 +294,7 @@ define <1 x i64> @extract_subvector_v2i64(<2 x i64> %op) #0 {
ret <1 x i64> %ret ret <1 x i64> %ret
} }
define void @extract_subvector_v4i64(<4 x i64>* %a, <2 x i64>* %b) #0 { define void @extract_subvector_v4i64(<4 x i64>* %a, <2 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v4i64: ; CHECK-LABEL: extract_subvector_v4i64:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ptrue p0.d, vl4
@ -418,23 +308,14 @@ define void @extract_subvector_v4i64(<4 x i64>* %a, <2 x i64>* %b) #0 {
ret void ret void
} }
define void @extract_subvector_v8i64(<8 x i64>* %a, <4 x i64>* %b) #0 { define void @extract_subvector_v8i64(<8 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v8i64: ; CHECK-LABEL: extract_subvector_v8i64:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4 ; CHECK-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret ; CHECK-NEXT: ret
;
; VBITS_GE_512-LABEL: extract_subvector_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ptrue p0.d, vl4
; VBITS_GE_512-NEXT: ext z0.b, z0.b, z0.b, #32
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT: ret
%op = load <8 x i64>, <8 x i64>* %a %op = load <8 x i64>, <8 x i64>* %a
%ret = call <4 x i64> @llvm.experimental.vector.extract.v4i64.v8i64(<8 x i64> %op, i64 4) %ret = call <4 x i64> @llvm.experimental.vector.extract.v4i64.v8i64(<8 x i64> %op, i64 4)
store <4 x i64> %ret, <4 x i64>* %b store <4 x i64> %ret, <4 x i64>* %b
@ -453,50 +334,20 @@ define void @extract_subvector_v16i64(<16 x i64>* %a, <8 x i64>* %b) #0 {
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret ; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: extract_subvector_v16i64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.d, vl8
; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
%op = load <16 x i64>, <16 x i64>* %a %op = load <16 x i64>, <16 x i64>* %a
%ret = call <8 x i64> @llvm.experimental.vector.extract.v8i64.v16i64(<16 x i64> %op, i64 8) %ret = call <8 x i64> @llvm.experimental.vector.extract.v8i64.v16i64(<16 x i64> %op, i64 8)
store <8 x i64> %ret, <8 x i64>* %b store <8 x i64> %ret, <8 x i64>* %b
ret void ret void
} }
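; When the extracted slice starts at a known element offset and fits a single
; register (the v8i64 case above, and v32i64 below), the extract folds into the
; load's addressing mode instead of needing a full-width load plus ext.
; Annotated sketch of that sequence (comments are illustrative):
;   mov  x8, #4                            ; element offset of the wanted half
;   ld1d { z0.d }, p0/z, [x0, x8, lsl #3]  ; load 4 doublewords from %a + 4*8 bytes
;   st1d { z0.d }, p0, [x1]                ; store them as the result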
define void @extract_subvector_v32i64(<32 x i64>* %a, <16 x i64>* %b) #0 { define void @extract_subvector_v32i64(<32 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v32i64: ; CHECK-LABEL: extract_subvector_v32i64:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16 ; CHECK-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: mov x9, #20 ; CHECK-NEXT: ptrue p0.d, vl16
; VBITS_GE_256-NEXT: mov x10, #28 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: mov x11, #24 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ret
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3]
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: mov x9, #12
; VBITS_GE_256-NEXT: mov x10, #4
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x10, lsl #3]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: extract_subvector_v32i64:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.d, vl16
; VBITS_GE_2048-NEXT: ext z0.b, z0.b, z0.b, #128
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
%op = load <32 x i64>, <32 x i64>* %a %op = load <32 x i64>, <32 x i64>* %a
%ret = call <16 x i64> @llvm.experimental.vector.extract.v16i64.v32i64(<32 x i64> %op, i64 16) %ret = call <16 x i64> @llvm.experimental.vector.extract.v16i64.v32i64(<32 x i64> %op, i64 16)
store <16 x i64> %ret, <16 x i64>* %b store <16 x i64> %ret, <16 x i64>* %b
@ -506,7 +357,7 @@ define void @extract_subvector_v32i64(<32 x i64>* %a, <16 x i64>* %b) #0 {
; f16 ; f16
; Don't use SVE for 64-bit vectors. ; Don't use SVE for 64-bit vectors.
define <2 x half> @extract_subvector_v4f16(<4 x half> %op) #0 { define <2 x half> @extract_subvector_v4f16(<4 x half> %op) vscale_range(16,0) #0 {
; CHECK-LABEL: extract_subvector_v4f16: ; CHECK-LABEL: extract_subvector_v4f16:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
@ -517,7 +368,7 @@ define <2 x half> @extract_subvector_v4f16(<4 x half> %op) #0 {
} }
; Don't use SVE for 128-bit vectors. ; Don't use SVE for 128-bit vectors.
define <4 x half> @extract_subvector_v8f16(<8 x half> %op) #0 { define <4 x half> @extract_subvector_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v8f16: ; CHECK-LABEL: extract_subvector_v8f16:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
@ -527,7 +378,7 @@ define <4 x half> @extract_subvector_v8f16(<8 x half> %op) #0 {
ret <4 x half> %ret ret <4 x half> %ret
} }
define void @extract_subvector_v16f16(<16 x half>* %a, <8 x half>* %b) #0 { define void @extract_subvector_v16f16(<16 x half>* %a, <8 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v16f16: ; CHECK-LABEL: extract_subvector_v16f16:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: ptrue p0.h, vl16
@ -564,62 +415,30 @@ define void @extract_subvector_v32f16(<32 x half>* %a, <16 x half>* %b) #0 {
ret void ret void
} }
define void @extract_subvector_v64f16(<64 x half>* %a, <32 x half>* %b) #0 { define void @extract_subvector_v64f16(<64 x half>* %a, <32 x half>* %b) vscale_range(8,0) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v64f16: ; CHECK-LABEL: extract_subvector_v64f16:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #32 ; CHECK-NEXT: ptrue p0.h, vl64
; VBITS_GE_256-NEXT: mov x9, #48 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: ptrue p0.h, vl32
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #64
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_256-NEXT: mov x8, #16 ; CHECK-NEXT: ret
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: extract_subvector_v64f16:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.h, vl32
; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
%op = load <64 x half>, <64 x half>* %a %op = load <64 x half>, <64 x half>* %a
%ret = call <32 x half> @llvm.experimental.vector.extract.v32f16.v64f16(<64 x half> %op, i64 32) %ret = call <32 x half> @llvm.experimental.vector.extract.v32f16.v64f16(<64 x half> %op, i64 32)
store <32 x half> %ret, <32 x half>* %b store <32 x half> %ret, <32 x half>* %b
ret void ret void
} }
define void @extract_subvector_v128f16(<128 x half>* %a, <64 x half>* %b) #0 { define void @extract_subvector_v128f16(<128 x half>* %a, <64 x half>* %b) vscale_range(16,0) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v128f16: ; CHECK-LABEL: extract_subvector_v128f16:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #64 ; CHECK-NEXT: ptrue p0.h, vl128
; VBITS_GE_256-NEXT: mov x9, #80 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: mov x10, #112 ; CHECK-NEXT: ptrue p0.h, vl64
; VBITS_GE_256-NEXT: mov x11, #96 ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; CHECK-NEXT: ret
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1]
; VBITS_GE_256-NEXT: mov x8, #32
; VBITS_GE_256-NEXT: mov x9, #48
; VBITS_GE_256-NEXT: mov x10, #16
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x9, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1, x10, lsl #1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: extract_subvector_v128f16:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.h, vl64
; VBITS_GE_2048-NEXT: ext z0.b, z0.b, z0.b, #128
; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
%op = load <128 x half>, <128 x half>* %a %op = load <128 x half>, <128 x half>* %a
%ret = call <64 x half> @llvm.experimental.vector.extract.v64f16.v128f16(<128 x half> %op, i64 64) %ret = call <64 x half> @llvm.experimental.vector.extract.v64f16.v128f16(<128 x half> %op, i64 64)
store <64 x half> %ret, <64 x half>* %b store <64 x half> %ret, <64 x half>* %b
@ -629,7 +448,7 @@ define void @extract_subvector_v128f16(<128 x half>* %a, <64 x half>* %b) #0 {
; f32 ; f32
; Don't use SVE for 64-bit vectors. ; Don't use SVE for 64-bit vectors.
define <1 x float> @extract_subvector_v2f32(<2 x float> %op) #0 { define <1 x float> @extract_subvector_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v2f32: ; CHECK-LABEL: extract_subvector_v2f32:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
@ -640,7 +459,7 @@ define <1 x float> @extract_subvector_v2f32(<2 x float> %op) #0 {
} }
; Don't use SVE for 128-bit vectors. ; Don't use SVE for 128-bit vectors.
define <2 x float> @extract_subvector_v4f32(<4 x float> %op) #0 { define <2 x float> @extract_subvector_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v4f32: ; CHECK-LABEL: extract_subvector_v4f32:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
@ -650,7 +469,7 @@ define <2 x float> @extract_subvector_v4f32(<4 x float> %op) #0 {
ret <2 x float> %ret ret <2 x float> %ret
} }
define void @extract_subvector_v8f32(<8 x float>* %a, <4 x float>* %b) #0 { define void @extract_subvector_v8f32(<8 x float>* %a, <4 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v8f32: ; CHECK-LABEL: extract_subvector_v8f32:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ptrue p0.s, vl8
@ -687,62 +506,30 @@ define void @extract_subvector_v16f32(<16 x float>* %a, <8 x float>* %b) #0 {
ret void ret void
} }
define void @extract_subvector_v32f32(<32 x float>* %a, <16 x float>* %b) #0 { define void @extract_subvector_v32f32(<32 x float>* %a, <16 x float>* %b) vscale_range(8,0) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v32f32: ; CHECK-LABEL: extract_subvector_v32f32:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16 ; CHECK-NEXT: ptrue p0.s, vl32
; VBITS_GE_256-NEXT: mov x9, #24 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ptrue p0.s, vl16
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #64
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_256-NEXT: mov x8, #8 ; CHECK-NEXT: ret
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: extract_subvector_v32f32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
%op = load <32 x float>, <32 x float>* %a %op = load <32 x float>, <32 x float>* %a
%ret = call <16 x float> @llvm.experimental.vector.extract.v16f32.v32f32(<32 x float> %op, i64 16) %ret = call <16 x float> @llvm.experimental.vector.extract.v16f32.v32f32(<32 x float> %op, i64 16)
store <16 x float> %ret, <16 x float>* %b store <16 x float> %ret, <16 x float>* %b
ret void ret void
} }
define void @extract_subvector_v64f32(<64 x float>* %a, <32 x float>* %b) #0 { define void @extract_subvector_v64f32(<64 x float>* %a, <32 x float>* %b) vscale_range(16,0) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v64f32: ; CHECK-LABEL: extract_subvector_v64f32:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #32 ; CHECK-NEXT: ptrue p0.s, vl64
; VBITS_GE_256-NEXT: mov x9, #40 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: mov x10, #56 ; CHECK-NEXT: ptrue p0.s, vl32
; VBITS_GE_256-NEXT: mov x11, #48 ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; CHECK-NEXT: ret
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2]
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: mov x9, #24
; VBITS_GE_256-NEXT: mov x10, #8
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x10, lsl #2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: extract_subvector_v64f32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
; VBITS_GE_2048-NEXT: ext z0.b, z0.b, z0.b, #128
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
%op = load <64 x float>, <64 x float>* %a %op = load <64 x float>, <64 x float>* %a
%ret = call <32 x float> @llvm.experimental.vector.extract.v32f32.v64f32(<64 x float> %op, i64 32) %ret = call <32 x float> @llvm.experimental.vector.extract.v32f32.v64f32(<64 x float> %op, i64 32)
store <32 x float> %ret, <32 x float>* %b store <32 x float> %ret, <32 x float>* %b
@ -752,7 +539,7 @@ define void @extract_subvector_v64f32(<64 x float>* %a, <32 x float>* %b) #0 {
; f64 ; f64
; Don't use SVE for 128-bit vectors. ; Don't use SVE for 128-bit vectors.
define <1 x double> @extract_subvector_v2f64(<2 x double> %op) #0 { define <1 x double> @extract_subvector_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v2f64: ; CHECK-LABEL: extract_subvector_v2f64:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
@ -762,7 +549,7 @@ define <1 x double> @extract_subvector_v2f64(<2 x double> %op) #0 {
ret <1 x double> %ret ret <1 x double> %ret
} }
define void @extract_subvector_v4f64(<4 x double>* %a, <2 x double>* %b) #0 { define void @extract_subvector_v4f64(<4 x double>* %a, <2 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v4f64: ; CHECK-LABEL: extract_subvector_v4f64:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ptrue p0.d, vl4
@ -799,62 +586,30 @@ define void @extract_subvector_v8f64(<8 x double>* %a, <4 x double>* %b) #0 {
ret void ret void
} }
define void @extract_subvector_v16f64(<16 x double>* %a, <8 x double>* %b) #0 { define void @extract_subvector_v16f64(<16 x double>* %a, <8 x double>* %b) vscale_range(8,0) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v16f64: ; CHECK-LABEL: extract_subvector_v16f64:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8 ; CHECK-NEXT: ptrue p0.d, vl16
; VBITS_GE_256-NEXT: mov x9, #12 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ptrue p0.d, vl8
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #64
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_256-NEXT: mov x8, #4 ; CHECK-NEXT: ret
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: extract_subvector_v16f64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.d, vl8
; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
%op = load <16 x double>, <16 x double>* %a %op = load <16 x double>, <16 x double>* %a
%ret = call <8 x double> @llvm.experimental.vector.extract.v8f64.v16f64(<16 x double> %op, i64 8) %ret = call <8 x double> @llvm.experimental.vector.extract.v8f64.v16f64(<16 x double> %op, i64 8)
store <8 x double> %ret, <8 x double>* %b store <8 x double> %ret, <8 x double>* %b
ret void ret void
} }
define void @extract_subvector_v32f64(<32 x double>* %a, <16 x double>* %b) #0 { define void @extract_subvector_v32f64(<32 x double>* %a, <16 x double>* %b) vscale_range(16,0) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v32f64: ; CHECK-LABEL: extract_subvector_v32f64:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16 ; CHECK-NEXT: ptrue p0.d, vl32
; VBITS_GE_256-NEXT: mov x9, #20 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: mov x10, #28 ; CHECK-NEXT: ptrue p0.d, vl16
; VBITS_GE_256-NEXT: mov x11, #24 ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; CHECK-NEXT: ret
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3]
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: mov x9, #12
; VBITS_GE_256-NEXT: mov x10, #4
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x10, lsl #3]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: extract_subvector_v32f64:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.d, vl16
; VBITS_GE_2048-NEXT: ext z0.b, z0.b, z0.b, #128
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
%op = load <32 x double>, <32 x double>* %a %op = load <32 x double>, <32 x double>* %a
%ret = call <16 x double> @llvm.experimental.vector.extract.v16f64.v32f64(<32 x double> %op, i64 16) %ret = call <16 x double> @llvm.experimental.vector.extract.v16f64.v32f64(<32 x double> %op, i64 16)
store <16 x double> %ret, <16 x double>* %b store <16 x double> %ret, <16 x double>* %b


@ -1,221 +1,259 @@
; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
target triple = "aarch64-unknown-linux-gnu" target triple = "aarch64-unknown-linux-gnu"
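; For reference, the bodies in these tests are regenerated rather than written
; by hand; a sketch of the usual invocation (the test path here is assumed,
; not taken from this diff):
;   utils/update_llc_test_checks.py llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll
; The script runs each remaining RUN line and rewrites the assertions for every
; prefix named in -check-prefixes.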
; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: ptrue
; ;
; extractelement ; extractelement
; ;
; Don't use SVE for 64-bit vectors. ; Don't use SVE for 64-bit vectors.
define half @extractelement_v4f16(<4 x half> %op1) #0 { define half @extractelement_v4f16(<4 x half> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: extractelement_v4f16: ; CHECK-LABEL: extractelement_v4f16:
; CHECK: mov h0, v0.h[3] ; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov h0, v0.h[3]
; CHECK-NEXT: ret ; CHECK-NEXT: ret
%r = extractelement <4 x half> %op1, i64 3 %r = extractelement <4 x half> %op1, i64 3
ret half %r ret half %r
} }
; Don't use SVE for 128-bit vectors. ; Don't use SVE for 128-bit vectors.
define half @extractelement_v8f16(<8 x half> %op1) #0 { define half @extractelement_v8f16(<8 x half> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: extractelement_v8f16: ; CHECK-LABEL: extractelement_v8f16:
; CHECK: mov h0, v0.h[7] ; CHECK: // %bb.0:
; CHECK-NEXT: mov h0, v0.h[7]
; CHECK-NEXT: ret ; CHECK-NEXT: ret
%r = extractelement <8 x half> %op1, i64 7 %r = extractelement <8 x half> %op1, i64 7
ret half %r ret half %r
} }
define half @extractelement_v16f16(<16 x half>* %a) #0 { define half @extractelement_v16f16(<16 x half>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: extractelement_v16f16: ; CHECK-LABEL: extractelement_v16f16:
; VBITS_GE_256: ptrue p0.h, vl16 ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: mov z0.h, z0.h[15] ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ret ; CHECK-NEXT: mov z0.h, z0.h[15]
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT: ret
%op1 = load <16 x half>, <16 x half>* %a %op1 = load <16 x half>, <16 x half>* %a
%r = extractelement <16 x half> %op1, i64 15 %r = extractelement <16 x half> %op1, i64 15
ret half %r ret half %r
} }
define half @extractelement_v32f16(<32 x half>* %a) #0 { define half @extractelement_v32f16(<32 x half>* %a) #0 {
; CHECK-LABEL: extractelement_v32f16: ; VBITS_GE_256-LABEL: extractelement_v32f16:
; VBITS_GE_512: ptrue p0.h, vl32 ; VBITS_GE_256: // %bb.0:
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: mov z0.h, z0.h[15]
; VBITS_GE_256-NEXT: // kill: def $h0 killed $h0 killed $z0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: extractelement_v32f16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: mov z0.h, z0.h[31] ; VBITS_GE_512-NEXT: mov z0.h, z0.h[31]
; VBITS_GE_512-NEXT: // kill: def $h0 killed $h0 killed $z0
; VBITS_GE_512-NEXT: ret ; VBITS_GE_512-NEXT: ret
%op1 = load <32 x half>, <32 x half>* %a %op1 = load <32 x half>, <32 x half>* %a
%r = extractelement <32 x half> %op1, i64 31 %r = extractelement <32 x half> %op1, i64 31
ret half %r ret half %r
} }
define half @extractelement_v64f16(<64 x half>* %a) #0 { define half @extractelement_v64f16(<64 x half>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: extractelement_v64f16: ; CHECK-LABEL: extractelement_v64f16:
; VBITS_GE_1024: ptrue p0.h, vl64 ; CHECK: // %bb.0:
; VBITS_GE_1024-NEXT: mov w8, #63 ; CHECK-NEXT: ptrue p0.h, vl64
; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: mov w8, #63
; VBITS_GE_1024-NEXT: whilels p0.h, xzr, x8 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: lastb h0, p0, z0.h ; CHECK-NEXT: whilels p0.h, xzr, x8
; VBITS_GE_1024-NEXT: ret ; CHECK-NEXT: lastb h0, p0, z0.h
; CHECK-NEXT: ret
%op1 = load <64 x half>, <64 x half>* %a %op1 = load <64 x half>, <64 x half>* %a
%r = extractelement <64 x half> %op1, i64 63 %r = extractelement <64 x half> %op1, i64 63
ret half %r ret half %r
} }
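; Annotated reading of the whilels/lastb pair above (sketch; the comments are
; illustrative, not autogenerated):
;   mov     w8, #63         ; index of the requested lane
;   whilels p0.h, xzr, x8   ; predicate active for lanes 0..63 (unsigned <=)
;   lastb   h0, p0, z0.h    ; take the last active lane, i.e. lane 63
; This form kicks in once the lane index exceeds what an indexed
; "mov z0.h, z0.h[imm]" can encode (the index tops out at 31 for .h elements).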
define half @extractelement_v128f16(<128 x half>* %a) #0 { define half @extractelement_v128f16(<128 x half>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: extractelement_v128f16: ; CHECK-LABEL: extractelement_v128f16:
; VBITS_GE_2048: ptrue p0.h, vl128 ; CHECK: // %bb.0:
; VBITS_GE_2048-NEXT: mov w8, #127 ; CHECK-NEXT: ptrue p0.h, vl128
; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: mov w8, #127
; VBITS_GE_2048-NEXT: whilels p0.h, xzr, x8 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_2048-NEXT: lastb h0, p0, z0.h ; CHECK-NEXT: whilels p0.h, xzr, x8
; VBITS_GE_2048-NEXT: ret ; CHECK-NEXT: lastb h0, p0, z0.h
; CHECK-NEXT: ret
%op1 = load <128 x half>, <128 x half>* %a %op1 = load <128 x half>, <128 x half>* %a
%r = extractelement <128 x half> %op1, i64 127 %r = extractelement <128 x half> %op1, i64 127
ret half %r ret half %r
} }
; Don't use SVE for 64-bit vectors. ; Don't use SVE for 64-bit vectors.
define float @extractelement_v2f32(<2 x float> %op1) #0 { define float @extractelement_v2f32(<2 x float> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: extractelement_v2f32: ; CHECK-LABEL: extractelement_v2f32:
; CHECK: mov s0, v0.s[1] ; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov s0, v0.s[1]
; CHECK-NEXT: ret ; CHECK-NEXT: ret
%r = extractelement <2 x float> %op1, i64 1 %r = extractelement <2 x float> %op1, i64 1
ret float %r ret float %r
} }
; Don't use SVE for 128-bit vectors. ; Don't use SVE for 128-bit vectors.
define float @extractelement_v4f32(<4 x float> %op1) #0 { define float @extractelement_v4f32(<4 x float> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: extractelement_v4f32: ; CHECK-LABEL: extractelement_v4f32:
; CHECK: mov s0, v0.s[3] ; CHECK: // %bb.0:
; CHECK-NEXT: mov s0, v0.s[3]
; CHECK-NEXT: ret ; CHECK-NEXT: ret
%r = extractelement <4 x float> %op1, i64 3 %r = extractelement <4 x float> %op1, i64 3
ret float %r ret float %r
} }
define float @extractelement_v8f32(<8 x float>* %a) #0 { define float @extractelement_v8f32(<8 x float>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: extractelement_v8f32: ; CHECK-LABEL: extractelement_v8f32:
; VBITS_GE_256: ptrue p0.s, vl8 ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov z0.s, z0.s[7] ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ret ; CHECK-NEXT: mov z0.s, z0.s[7]
; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
; CHECK-NEXT: ret
%op1 = load <8 x float>, <8 x float>* %a %op1 = load <8 x float>, <8 x float>* %a
%r = extractelement <8 x float> %op1, i64 7 %r = extractelement <8 x float> %op1, i64 7
ret float %r ret float %r
} }
define float @extractelement_v16f32(<16 x float>* %a) #0 { define float @extractelement_v16f32(<16 x float>* %a) #0 {
; CHECK-LABEL: extractelement_v16f32: ; VBITS_GE_256-LABEL: extractelement_v16f32:
; VBITS_GE_512: ptrue p0.s, vl16 ; VBITS_GE_256: // %bb.0:
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: mov z0.s, z0.s[7]
; VBITS_GE_256-NEXT: // kill: def $s0 killed $s0 killed $z0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: extractelement_v16f32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: mov z0.s, z0.s[15] ; VBITS_GE_512-NEXT: mov z0.s, z0.s[15]
; VBITS_GE_512-NEXT: // kill: def $s0 killed $s0 killed $z0
; VBITS_GE_512-NEXT: ret ; VBITS_GE_512-NEXT: ret
%op1 = load <16 x float>, <16 x float>* %a %op1 = load <16 x float>, <16 x float>* %a
%r = extractelement <16 x float> %op1, i64 15 %r = extractelement <16 x float> %op1, i64 15
ret float %r ret float %r
} }
define float @extractelement_v32f32(<32 x float>* %a) #0 { define float @extractelement_v32f32(<32 x float>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: extractelement_v32f32: ; CHECK-LABEL: extractelement_v32f32:
; VBITS_GE_1024: ptrue p0.s, vl32 ; CHECK: // %bb.0:
; VBITS_GE_1024-NEXT: mov w8, #31 ; CHECK-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: mov w8, #31
; VBITS_GE_1024-NEXT: whilels p0.s, xzr, x8 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: lastb s0, p0, z0.s ; CHECK-NEXT: whilels p0.s, xzr, x8
; VBITS_GE_1024-NEXT: ret ; CHECK-NEXT: lastb s0, p0, z0.s
; CHECK-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a %op1 = load <32 x float>, <32 x float>* %a
%r = extractelement <32 x float> %op1, i64 31 %r = extractelement <32 x float> %op1, i64 31
ret float %r ret float %r
} }
define float @extractelement_v64f32(<64 x float>* %a) #0 { define float @extractelement_v64f32(<64 x float>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: extractelement_v64f32: ; CHECK-LABEL: extractelement_v64f32:
; VBITS_GE_2048: ptrue p0.s, vl64 ; CHECK: // %bb.0:
; VBITS_GE_2048-NEXT: mov w8, #63 ; CHECK-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: mov w8, #63
; VBITS_GE_2048-NEXT: whilels p0.s, xzr, x8 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: lastb s0, p0, z0.s ; CHECK-NEXT: whilels p0.s, xzr, x8
; VBITS_GE_2048-NEXT: ret ; CHECK-NEXT: lastb s0, p0, z0.s
; CHECK-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a %op1 = load <64 x float>, <64 x float>* %a
%r = extractelement <64 x float> %op1, i64 63 %r = extractelement <64 x float> %op1, i64 63
ret float %r ret float %r
} }
; Don't use SVE for 64-bit vectors. ; Don't use SVE for 64-bit vectors.
define double @extractelement_v1f64(<1 x double> %op1) #0 { define double @extractelement_v1f64(<1 x double> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: extractelement_v1f64: ; CHECK-LABEL: extractelement_v1f64:
; CHECK: ret ; CHECK: // %bb.0:
; CHECK-NEXT: ret
%r = extractelement <1 x double> %op1, i64 0 %r = extractelement <1 x double> %op1, i64 0
ret double %r ret double %r
} }
; Don't use SVE for 128-bit vectors. ; Don't use SVE for 128-bit vectors.
define double @extractelement_v2f64(<2 x double> %op1) #0 { define double @extractelement_v2f64(<2 x double> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: extractelement_v2f64: ; CHECK-LABEL: extractelement_v2f64:
; CHECK: mov d0, v0.d[1] ; CHECK: // %bb.0:
; CHECK-NEXT: mov d0, v0.d[1]
; CHECK-NEXT: ret ; CHECK-NEXT: ret
%r = extractelement <2 x double> %op1, i64 1 %r = extractelement <2 x double> %op1, i64 1
ret double %r ret double %r
} }
define double @extractelement_v4f64(<4 x double>* %a) #0 { define double @extractelement_v4f64(<4 x double>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: extractelement_v4f64: ; CHECK-LABEL: extractelement_v4f64:
; VBITS_GE_256: ptrue p0.d, vl4 ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov z0.d, z0.d[3] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: ret ; CHECK-NEXT: mov z0.d, z0.d[3]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%op1 = load <4 x double>, <4 x double>* %a %op1 = load <4 x double>, <4 x double>* %a
%r = extractelement <4 x double> %op1, i64 3 %r = extractelement <4 x double> %op1, i64 3
ret double %r ret double %r
} }
define double @extractelement_v8f64(<8 x double>* %a) #0 { define double @extractelement_v8f64(<8 x double>* %a) #0 {
; CHECK-LABEL: extractelement_v8f64: ; VBITS_GE_256-LABEL: extractelement_v8f64:
; VBITS_GE_512: ptrue p0.d, vl8 ; VBITS_GE_256: // %bb.0:
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: mov z0.d, z0.d[3]
; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: extractelement_v8f64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: mov z0.d, z0.d[7] ; VBITS_GE_512-NEXT: mov z0.d, z0.d[7]
; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_512-NEXT: ret ; VBITS_GE_512-NEXT: ret
%op1 = load <8 x double>, <8 x double>* %a %op1 = load <8 x double>, <8 x double>* %a
%r = extractelement <8 x double> %op1, i64 7 %r = extractelement <8 x double> %op1, i64 7
ret double %r ret double %r
} }
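; Functions like the one above keep two bodies on purpose: with no
; vscale_range attribute they must hold for every RUN line, so both the
; 256-bit lowering (offset load of the half that holds the lane) and the
; >=512-bit lowering (one register plus an indexed mov) stay pinned. Sketch of
; the retained RUN-to-prefix mapping, restating the RUN lines above:
;   -aarch64-sve-vector-bits-min=256  -> CHECK,VBITS_GE_256
;   -aarch64-sve-vector-bits-min=512  -> CHECK,VBITS_GE_512
;   -aarch64-sve-vector-bits-min=2048 -> CHECK,VBITS_GE_512
; the 2048-bit run reuses VBITS_GE_512 because its output matches it here,
; preserving the expectations of the old manual CHECK lines.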
define double @extractelement_v16f64(<16 x double>* %a) #0 { define double @extractelement_v16f64(<16 x double>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: extractelement_v16f64: ; CHECK-LABEL: extractelement_v16f64:
; VBITS_GE_1024: ptrue p0.d, vl16 ; CHECK: // %bb.0:
; VBITS_GE_1024-NEXT: mov w8, #15 ; CHECK-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: mov w8, #15
; VBITS_GE_1024-NEXT: whilels p0.d, xzr, x8 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: lastb d0, p0, z0.d ; CHECK-NEXT: whilels p0.d, xzr, x8
; VBITS_GE_1024-NEXT: ret ; CHECK-NEXT: lastb d0, p0, z0.d
; CHECK-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a %op1 = load <16 x double>, <16 x double>* %a
%r = extractelement <16 x double> %op1, i64 15 %r = extractelement <16 x double> %op1, i64 15
ret double %r ret double %r
} }
define double @extractelement_v32f64(<32 x double>* %a) #0 { define double @extractelement_v32f64(<32 x double>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: extractelement_v32f64: ; CHECK-LABEL: extractelement_v32f64:
; VBITS_GE_2048: ptrue p0.d, vl32 ; CHECK: // %bb.0:
; VBITS_GE_2048-NEXT: mov w8, #31 ; CHECK-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: mov w8, #31
; VBITS_GE_2048-NEXT: whilels p0.d, xzr, x8 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: lastb d0, p0, z0.d ; CHECK-NEXT: whilels p0.d, xzr, x8
; VBITS_GE_2048-NEXT: ret ; CHECK-NEXT: lastb d0, p0, z0.d
; CHECK-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a %op1 = load <32 x double>, <32 x double>* %a
%r = extractelement <32 x double> %op1, i64 31 %r = extractelement <32 x double> %op1, i64 31
ret double %r ret double %r

File diff suppressed because it is too large


@ -1,21 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; Don't use SVE when its registers are no bigger than NEON.
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep 'z[0-9]'
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
target triple = "aarch64-unknown-linux-gnu" target triple = "aarch64-unknown-linux-gnu"
@ -24,7 +10,7 @@ target triple = "aarch64-unknown-linux-gnu"
; ;
; Don't use SVE for 64-bit vectors. ; Don't use SVE for 64-bit vectors.
define <4 x i16> @fcmp_oeq_v4f16(<4 x half> %op1, <4 x half> %op2) #0 { define <4 x i16> @fcmp_oeq_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_oeq_v4f16: ; CHECK-LABEL: fcmp_oeq_v4f16:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: fcmeq v0.4h, v0.4h, v1.4h ; CHECK-NEXT: fcmeq v0.4h, v0.4h, v1.4h
@ -35,7 +21,7 @@ define <4 x i16> @fcmp_oeq_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
} }
; Don't use SVE for 128-bit vectors. ; Don't use SVE for 128-bit vectors.
define <8 x i16> @fcmp_oeq_v8f16(<8 x half> %op1, <8 x half> %op2) #0 { define <8 x i16> @fcmp_oeq_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_oeq_v8f16: ; CHECK-LABEL: fcmp_oeq_v8f16:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: fcmeq v0.8h, v0.8h, v1.8h ; CHECK-NEXT: fcmeq v0.8h, v0.8h, v1.8h
@ -45,7 +31,7 @@ define <8 x i16> @fcmp_oeq_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
ret <8 x i16> %sext ret <8 x i16> %sext
} }
define void @fcmp_oeq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { define void @fcmp_oeq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_oeq_v16f16: ; CHECK-LABEL: fcmp_oeq_v16f16:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: ptrue p0.h, vl16
@ -64,7 +50,6 @@ define void @fcmp_oeq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
} }
define void @fcmp_oeq_v32f16(<32 x half>* %a, <32 x half>* %b, <32 x i16>* %c) #0 { define void @fcmp_oeq_v32f16(<32 x half>* %a, <32 x half>* %b, <32 x i16>* %c) #0 {
; Ensure sensible type legalisation
; VBITS_GE_256-LABEL: fcmp_oeq_v32f16: ; VBITS_GE_256-LABEL: fcmp_oeq_v32f16:
; VBITS_GE_256: // %bb.0: ; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16 ; VBITS_GE_256-NEXT: mov x8, #16
@ -98,44 +83,16 @@ define void @fcmp_oeq_v32f16(<32 x half>* %a, <32 x half>* %b, <32 x i16>* %c) #
ret void ret void
} }
define void @fcmp_oeq_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x i16>* %c) #0 { define void @fcmp_oeq_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x i16>* %c) vscale_range(8,0) #0 {
; VBITS_GE_256-LABEL: fcmp_oeq_v64f16: ; CHECK-LABEL: fcmp_oeq_v64f16:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #32 ; CHECK-NEXT: ptrue p0.h, vl64
; VBITS_GE_256-NEXT: mov x9, #48 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: mov x10, #16 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] ; CHECK-NEXT: st1h { z0.h }, p0, [x2]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] ; CHECK-NEXT: ret
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x1, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x1, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z2.h, z5.h
; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z1.h, z4.h
; VBITS_GE_256-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z2.h, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z0.h, z6.h
; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z3.h, z7.h
; VBITS_GE_256-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z3.h, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x2, x9, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2, x10, lsl #1]
; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: fcmp_oeq_v64f16:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_1024-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
; VBITS_GE_1024-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x2]
; VBITS_GE_1024-NEXT: ret
%op1 = load <64 x half>, <64 x half>* %a %op1 = load <64 x half>, <64 x half>* %a
%op2 = load <64 x half>, <64 x half>* %b %op2 = load <64 x half>, <64 x half>* %b
%cmp = fcmp oeq <64 x half> %op1, %op2 %cmp = fcmp oeq <64 x half> %op1, %op2
@ -144,68 +101,16 @@ define void @fcmp_oeq_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x i16>* %c) #
ret void ret void
} }
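; Annotated reading of the compare idiom above (sketch; comments are
; illustrative): the IR's fcmp-then-sext pair becomes a predicate plus a
; predicated splat of all-ones:
;   fcmeq p1.h, p0/z, z0.h, z1.h   ; p1 = per-lane i1 result of fcmp oeq
;   mov   z0.h, p1/z, #-1          ; active lanes -> 0xffff, inactive -> 0
;   st1h  { z0.h }, p0, [x2]       ; store the sign-extended <64 x i16> mask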
define void @fcmp_oeq_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x i16>* %c) #0 { define void @fcmp_oeq_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x i16>* %c) vscale_range(16,0) #0 {
; VBITS_GE_256-LABEL: fcmp_oeq_v128f16: ; CHECK-LABEL: fcmp_oeq_v128f16:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #96 ; CHECK-NEXT: ptrue p0.h, vl128
; VBITS_GE_256-NEXT: mov x9, #112 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: mov x10, #64 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: mov x11, #80 ; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
; VBITS_GE_256-NEXT: mov x12, #32 ; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov x13, #48 ; CHECK-NEXT: st1h { z0.h }, p0, [x2]
; VBITS_GE_256-NEXT: mov x14, #16 ; CHECK-NEXT: ret
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x0, x12, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x0, x13, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x0, x14, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z16.h }, p0/z, [x1, x13, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z17.h }, p0/z, [x1, x14, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z20.h }, p0/z, [x1, x11, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z21.h }, p0/z, [x1, x12, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z19.h }, p0/z, [x1, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z22.h }, p0/z, [x1, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z18.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z23.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z6.h, z17.h
; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z5.h, z16.h
; VBITS_GE_256-NEXT: mov z5.h, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z6.h, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z4.h, z21.h
; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z3.h, z20.h
; VBITS_GE_256-NEXT: mov z3.h, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z4.h, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z2.h, z22.h
; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z1.h, z19.h
; VBITS_GE_256-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z2.h, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z0.h, z18.h
; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z7.h, z23.h
; VBITS_GE_256-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z7.h, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x2, x9, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2, x10, lsl #1]
; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x2, x11, lsl #1]
; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x2, x12, lsl #1]
; VBITS_GE_256-NEXT: st1h { z6.h }, p0, [x2, x13, lsl #1]
; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x2, x14, lsl #1]
; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: fcmp_oeq_v128f16:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_2048-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
; VBITS_GE_2048-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x2]
; VBITS_GE_2048-NEXT: ret
define void @fcmp_oeq_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x i16>* %c) vscale_range(16,0) #0 {
; CHECK-LABEL: fcmp_oeq_v128f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: st1h { z0.h }, p0, [x2]
; CHECK-NEXT: ret
%op1 = load <128 x half>, <128 x half>* %a
%op2 = load <128 x half>, <128 x half>* %b
%cmp = fcmp oeq <128 x half> %op1, %op2
@@ -215,7 +120,7 @@ define void @fcmp_oeq_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x i16>* %
}
; Don't use SVE for 64-bit vectors.
define <2 x i32> @fcmp_oeq_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
define <2 x i32> @fcmp_oeq_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_oeq_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fcmeq v0.2s, v0.2s, v1.2s
@@ -226,7 +131,7 @@ define <2 x i32> @fcmp_oeq_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
}
; Don't use SVE for 128-bit vectors.
define <4 x i32> @fcmp_oeq_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
define <4 x i32> @fcmp_oeq_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_oeq_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fcmeq v0.4s, v0.4s, v1.4s
@@ -236,7 +141,7 @@ define <4 x i32> @fcmp_oeq_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
ret <4 x i32> %sext
}
define void @fcmp_oeq_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x i32>* %c) #0 {
define void @fcmp_oeq_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x i32>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_oeq_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -255,7 +160,6 @@ define void @fcmp_oeq_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x i32>* %c) #0
}
define void @fcmp_oeq_v16f32(<16 x float>* %a, <16 x float>* %b, <16 x i32>* %c) #0 {
; Ensure sensible type legalisation
; VBITS_GE_256-LABEL: fcmp_oeq_v16f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
@@ -289,44 +193,16 @@ define void @fcmp_oeq_v16f32(<16 x float>* %a, <16 x float>* %b, <16 x i32>* %c)
ret void
}
define void @fcmp_oeq_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x i32>* %c) #0 {
; VBITS_GE_256-LABEL: fcmp_oeq_v32f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: mov x9, #24
; VBITS_GE_256-NEXT: mov x10, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x1, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z2.s, z5.s
; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z1.s, z4.s
; VBITS_GE_256-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z2.s, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, z6.s
; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z3.s, z7.s
; VBITS_GE_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x2, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: fcmp_oeq_v32f32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_1024-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_1024-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x2]
; VBITS_GE_1024-NEXT: ret
define void @fcmp_oeq_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x i32>* %c) vscale_range(8,0) #0 {
; CHECK-LABEL: fcmp_oeq_v32f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: st1w { z0.s }, p0, [x2]
; CHECK-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
%op2 = load <32 x float>, <32 x float>* %b
%cmp = fcmp oeq <32 x float> %op1, %op2
@@ -335,68 +211,16 @@ define void @fcmp_oeq_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x i32>* %c)
ret void
}
define void @fcmp_oeq_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x i32>* %c) #0 {
; VBITS_GE_256-LABEL: fcmp_oeq_v64f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #48
; VBITS_GE_256-NEXT: mov x9, #56
; VBITS_GE_256-NEXT: mov x10, #32
; VBITS_GE_256-NEXT: mov x11, #40
; VBITS_GE_256-NEXT: mov x12, #16
; VBITS_GE_256-NEXT: mov x13, #24
; VBITS_GE_256-NEXT: mov x14, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x12, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x13, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x14, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1w { z16.s }, p0/z, [x1, x13, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z17.s }, p0/z, [x1, x14, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z20.s }, p0/z, [x1, x11, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z21.s }, p0/z, [x1, x12, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z19.s }, p0/z, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z22.s }, p0/z, [x1, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z18.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z23.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z6.s, z17.s
; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z5.s, z16.s
; VBITS_GE_256-NEXT: mov z5.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z6.s, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z4.s, z21.s
; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z3.s, z20.s
; VBITS_GE_256-NEXT: mov z3.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z4.s, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z2.s, z22.s
; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z1.s, z19.s
; VBITS_GE_256-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z2.s, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, z18.s
; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z7.s, z23.s
; VBITS_GE_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z7.s, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x2, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x2, x11, lsl #2]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x2, x12, lsl #2]
; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x2, x13, lsl #2]
; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x2, x14, lsl #2]
; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: fcmp_oeq_v64f32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_2048-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_2048-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x2]
; VBITS_GE_2048-NEXT: ret
define void @fcmp_oeq_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x i32>* %c) vscale_range(16,0) #0 {
; CHECK-LABEL: fcmp_oeq_v64f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: st1w { z0.s }, p0, [x2]
; CHECK-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a
%op2 = load <64 x float>, <64 x float>* %b
%cmp = fcmp oeq <64 x float> %op1, %op2
@@ -406,7 +230,7 @@ define void @fcmp_oeq_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x i32>* %c)
}
; Don't use SVE for 64-bit vectors.
define <1 x i64> @fcmp_oeq_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
define <1 x i64> @fcmp_oeq_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_oeq_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: fcmeq d0, d0, d1
@@ -417,7 +241,7 @@ define <1 x i64> @fcmp_oeq_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
}
; Don't use SVE for 128-bit vectors.
define <2 x i64> @fcmp_oeq_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
define <2 x i64> @fcmp_oeq_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_oeq_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: fcmeq v0.2d, v0.2d, v1.2d
@@ -427,7 +251,7 @@ define <2 x i64> @fcmp_oeq_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
ret <2 x i64> %sext
}
define void @fcmp_oeq_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x i64>* %c) #0 {
define void @fcmp_oeq_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x i64>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_oeq_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -446,7 +270,6 @@ define void @fcmp_oeq_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x i64>* %c) #
}
define void @fcmp_oeq_v8f64(<8 x double>* %a, <8 x double>* %b, <8 x i64>* %c) #0 {
; Ensure sensible type legalisation
; VBITS_GE_256-LABEL: fcmp_oeq_v8f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
@@ -480,44 +303,16 @@ define void @fcmp_oeq_v8f64(<8 x double>* %a, <8 x double>* %b, <8 x i64>* %c) #
ret void
}
define void @fcmp_oeq_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x i64>* %c) #0 {
; VBITS_GE_256-LABEL: fcmp_oeq_v16f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: mov x9, #12
; VBITS_GE_256-NEXT: mov x10, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x1, x10, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z2.d, z5.d
; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z1.d, z4.d
; VBITS_GE_256-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z2.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, z6.d
; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z3.d, z7.d
; VBITS_GE_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z3.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x2, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: fcmp_oeq_v16f64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_1024-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_1024-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x2]
; VBITS_GE_1024-NEXT: ret
define void @fcmp_oeq_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x i64>* %c) vscale_range(8,0) #0 {
; CHECK-LABEL: fcmp_oeq_v16f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: st1d { z0.d }, p0, [x2]
; CHECK-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%op2 = load <16 x double>, <16 x double>* %b
%cmp = fcmp oeq <16 x double> %op1, %op2
@@ -526,68 +321,16 @@ define void @fcmp_oeq_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x i64>* %
ret void
}
define void @fcmp_oeq_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x i64>* %c) #0 {
; VBITS_GE_256-LABEL: fcmp_oeq_v32f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #24
; VBITS_GE_256-NEXT: mov x9, #28
; VBITS_GE_256-NEXT: mov x10, #16
; VBITS_GE_256-NEXT: mov x11, #20
; VBITS_GE_256-NEXT: mov x12, #8
; VBITS_GE_256-NEXT: mov x13, #12
; VBITS_GE_256-NEXT: mov x14, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1d { z16.d }, p0/z, [x1, x13, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z17.d }, p0/z, [x1, x14, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z20.d }, p0/z, [x1, x11, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z21.d }, p0/z, [x1, x12, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z19.d }, p0/z, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z22.d }, p0/z, [x1, x10, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z18.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z23.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z6.d, z17.d
; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z5.d, z16.d
; VBITS_GE_256-NEXT: mov z5.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z6.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z4.d, z21.d
; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z3.d, z20.d
; VBITS_GE_256-NEXT: mov z3.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z4.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z2.d, z22.d
; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z1.d, z19.d
; VBITS_GE_256-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z2.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, z18.d
; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z7.d, z23.d
; VBITS_GE_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z7.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x2, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x2, x11, lsl #3]
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x2, x12, lsl #3]
; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x2, x13, lsl #3]
; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x2, x14, lsl #3]
; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: fcmp_oeq_v32f64:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_2048-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_2048-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x2]
; VBITS_GE_2048-NEXT: ret
define void @fcmp_oeq_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x i64>* %c) vscale_range(16,0) #0 {
; CHECK-LABEL: fcmp_oeq_v32f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: st1d { z0.d }, p0, [x2]
; CHECK-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%op2 = load <32 x double>, <32 x double>* %b
%cmp = fcmp oeq <32 x double> %op1, %op2
@@ -600,7 +343,7 @@ define void @fcmp_oeq_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x i64>* %
; FCMP UEQ
;
define void @fcmp_ueq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_ueq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_ueq_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -624,7 +367,7 @@ define void @fcmp_ueq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP ONE
;
define void @fcmp_one_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_one_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_one_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -648,7 +391,7 @@ define void @fcmp_one_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP UNE
;
define void @fcmp_une_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_une_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_une_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -670,7 +413,7 @@ define void @fcmp_une_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP OGT
;
define void @fcmp_ogt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_ogt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_ogt_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -692,7 +435,7 @@ define void @fcmp_ogt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP UGT
;
define void @fcmp_ugt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_ugt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_ugt_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -716,7 +459,7 @@ define void @fcmp_ugt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP OLT
;
define void @fcmp_olt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_olt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_olt_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -738,7 +481,7 @@ define void @fcmp_olt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP ULT
;
define void @fcmp_ult_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_ult_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_ult_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -762,7 +505,7 @@ define void @fcmp_ult_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP OGE
;
define void @fcmp_oge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_oge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_oge_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -784,7 +527,7 @@ define void @fcmp_oge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP UGE
;
define void @fcmp_uge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_uge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_uge_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -808,7 +551,7 @@ define void @fcmp_uge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP OLE
;
define void @fcmp_ole_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_ole_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_ole_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -830,7 +573,7 @@ define void @fcmp_ole_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP ULE
;
define void @fcmp_ule_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_ule_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_ule_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -854,7 +597,7 @@ define void @fcmp_ule_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP UNO
;
define void @fcmp_uno_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_uno_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_uno_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -876,7 +619,7 @@ define void @fcmp_uno_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP ORD
;
define void @fcmp_ord_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_ord_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_ord_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -900,7 +643,7 @@ define void @fcmp_ord_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
; FCMP EQ
;
define void @fcmp_eq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_eq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_eq_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -922,7 +665,7 @@ define void @fcmp_eq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0
; FCMP NE
;
define void @fcmp_ne_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_ne_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_ne_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -944,7 +687,7 @@ define void @fcmp_ne_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0
; FCMP GT
;
define void @fcmp_gt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_gt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_gt_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -966,7 +709,7 @@ define void @fcmp_gt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0
; FCMP LT
;
define void @fcmp_lt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_lt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_lt_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -988,7 +731,7 @@ define void @fcmp_lt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0
; FCMP GE
;
define void @fcmp_ge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_ge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_ge_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -1010,7 +753,7 @@ define void @fcmp_ge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0
; FCMP LE
;
define void @fcmp_le_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 {
define void @fcmp_le_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fcmp_le_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16


@@ -1,21 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; Don't use SVE when its registers are no bigger than NEON.
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
@@ -24,7 +10,7 @@ target triple = "aarch64-unknown-linux-gnu"
;
; Don't use SVE for 64-bit vectors.
define void @fcvt_v2f16_v2f32(<2 x half>* %a, <2 x float>* %b) #0 {
define void @fcvt_v2f16_v2f32(<2 x half>* %a, <2 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v2f16_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0]
@@ -38,7 +24,7 @@ define void @fcvt_v2f16_v2f32(<2 x half>* %a, <2 x float>* %b) #0 {
}
; Don't use SVE for 128-bit vectors.
define void @fcvt_v4f16_v4f32(<4 x half>* %a, <4 x float>* %b) #0 {
define void @fcvt_v4f16_v4f32(<4 x half>* %a, <4 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v4f16_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
@@ -51,7 +37,7 @@ define void @fcvt_v4f16_v4f32(<4 x half>* %a, <4 x float>* %b) #0 {
ret void
}
define void @fcvt_v8f16_v8f32(<8 x half>* %a, <8 x float>* %b) #0 {
define void @fcvt_v8f16_v8f32(<8 x half>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v8f16_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -66,7 +52,6 @@ define void @fcvt_v8f16_v8f32(<8 x half>* %a, <8 x float>* %b) #0 {
}
define void @fcvt_v16f16_v16f32(<16 x half>* %a, <16 x float>* %b) #0 {
; Ensure sensible type legalisation.
; VBITS_GE_256-LABEL: fcvt_v16f16_v16f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
@@ -86,91 +71,34 @@ define void @fcvt_v16f16_v16f32(<16 x half>* %a, <16 x float>* %b) #0 {
; VBITS_GE_512-NEXT: fcvt z0.s, p0/m, z0.h
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT: ret
%op1 = load <16 x half>, <16 x half>* %a
%res = fpext <16 x half> %op1 to <16 x float>
store <16 x float> %res, <16 x float>* %b
ret void
}
define void @fcvt_v32f16_v32f32(<32 x half>* %a, <32 x float>* %b) #0 {
; VBITS_GE_256-LABEL: fcvt_v32f16_v32f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: mov x9, #24
; VBITS_GE_256-NEXT: mov x10, #16
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1h { z0.s }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.s }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z2.s }, p0/z, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: fcvt z0.s, p0/m, z0.h
; VBITS_GE_256-NEXT: fcvt z1.s, p0/m, z1.h
; VBITS_GE_256-NEXT: fcvt z2.s, p0/m, z2.h
; VBITS_GE_256-NEXT: fcvt z3.s, p0/m, z3.h
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: fcvt_v32f16_v32f32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1h { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: fcvt z0.s, p0/m, z0.h
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @fcvt_v32f16_v32f32(<32 x half>* %a, <32 x float>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvt_v32f16_v32f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
; CHECK-NEXT: fcvt z0.s, p0/m, z0.h
; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret
%op1 = load <32 x half>, <32 x half>* %a
%res = fpext <32 x half> %op1 to <32 x float>
store <32 x float> %res, <32 x float>* %b
ret void
}
define void @fcvt_v64f16_v64f32(<64 x half>* %a, <64 x float>* %b) #0 {
; VBITS_GE_256-LABEL: fcvt_v64f16_v64f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: mov x9, #24
; VBITS_GE_256-NEXT: mov x10, #16
; VBITS_GE_256-NEXT: mov x11, #40
; VBITS_GE_256-NEXT: mov x12, #32
; VBITS_GE_256-NEXT: mov x13, #56
; VBITS_GE_256-NEXT: mov x14, #48
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1h { z0.s }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.s }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z2.s }, p0/z, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.s }, p0/z, [x0, x11, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z4.s }, p0/z, [x0, x12, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z5.s }, p0/z, [x0, x13, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z6.s }, p0/z, [x0, x14, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z7.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: fcvt z0.s, p0/m, z0.h
; VBITS_GE_256-NEXT: fcvt z1.s, p0/m, z1.h
; VBITS_GE_256-NEXT: fcvt z2.s, p0/m, z2.h
; VBITS_GE_256-NEXT: fcvt z3.s, p0/m, z3.h
; VBITS_GE_256-NEXT: fcvt z4.s, p0/m, z4.h
; VBITS_GE_256-NEXT: fcvt z5.s, p0/m, z5.h
; VBITS_GE_256-NEXT: fcvt z6.s, p0/m, z6.h
; VBITS_GE_256-NEXT: fcvt z7.s, p0/m, z7.h
; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x1, x14, lsl #2]
; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x1, x13, lsl #2]
; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x1, x12, lsl #2]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x11, lsl #2]
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: fcvt_v64f16_v64f32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: ld1h { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: fcvt z0.s, p0/m, z0.h
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
define void @fcvt_v64f16_v64f32(<64 x half>* %a, <64 x float>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvt_v64f16_v64f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
; CHECK-NEXT: fcvt z0.s, p0/m, z0.h
; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret
%op1 = load <64 x half>, <64 x half>* %a
%res = fpext <64 x half> %op1 to <64 x float>
store <64 x float> %res, <64 x float>* %b
@@ -182,7 +110,7 @@ define void @fcvt_v64f16_v64f32(<64 x half>* %a, <64 x float>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
define void @fcvt_v1f16_v1f64(<1 x half>* %a, <1 x double>* %b) #0 {
define void @fcvt_v1f16_v1f64(<1 x half>* %a, <1 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v1f16_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr h0, [x0]
@@ -196,7 +124,7 @@ define void @fcvt_v1f16_v1f64(<1 x half>* %a, <1 x double>* %b) #0 {
}
; v2f16 is not legal for NEON, so use SVE
define void @fcvt_v2f16_v2f64(<2 x half>* %a, <2 x double>* %b) #0 {
define void @fcvt_v2f16_v2f64(<2 x half>* %a, <2 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v2f16_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0]
@@ -212,7 +140,7 @@ define void @fcvt_v2f16_v2f64(<2 x half>* %a, <2 x double>* %b) #0 {
ret void
}
define void @fcvt_v4f16_v4f64(<4 x half>* %a, <4 x double>* %b) #0 {
define void @fcvt_v4f16_v4f64(<4 x half>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v4f16_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -246,91 +174,34 @@ define void @fcvt_v8f16_v8f64(<8 x half>* %a, <8 x double>* %b) #0 {
; VBITS_GE_512-NEXT: fcvt z0.d, p0/m, z0.h
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT: ret
%op1 = load <8 x half>, <8 x half>* %a
%res = fpext <8 x half> %op1 to <8 x double>
store <8 x double> %res, <8 x double>* %b
ret void
}
define void @fcvt_v16f16_v16f64(<16 x half>* %a, <16 x double>* %b) #0 {
; VBITS_GE_256-LABEL: fcvt_v16f16_v16f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: mov x9, #12
; VBITS_GE_256-NEXT: mov x10, #8
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1h { z0.d }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.d }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z2.d }, p0/z, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: fcvt z0.d, p0/m, z0.h
; VBITS_GE_256-NEXT: fcvt z1.d, p0/m, z1.h
; VBITS_GE_256-NEXT: fcvt z2.d, p0/m, z2.h
; VBITS_GE_256-NEXT: fcvt z3.d, p0/m, z3.h
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: fcvt_v16f16_v16f64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1h { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: fcvt z0.d, p0/m, z0.h
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @fcvt_v16f16_v16f64(<16 x half>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvt_v16f16_v16f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
; CHECK-NEXT: fcvt z0.d, p0/m, z0.h
; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; CHECK-NEXT: ret
%op1 = load <16 x half>, <16 x half>* %a
%res = fpext <16 x half> %op1 to <16 x double>
store <16 x double> %res, <16 x double>* %b
ret void
}
define void @fcvt_v32f16_v32f64(<32 x half>* %a, <32 x double>* %b) #0 {
; VBITS_GE_256-LABEL: fcvt_v32f16_v32f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: mov x9, #12
; VBITS_GE_256-NEXT: mov x10, #8
; VBITS_GE_256-NEXT: mov x11, #20
; VBITS_GE_256-NEXT: mov x12, #16
; VBITS_GE_256-NEXT: mov x13, #28
; VBITS_GE_256-NEXT: mov x14, #24
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1h { z0.d }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.d }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z2.d }, p0/z, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.d }, p0/z, [x0, x11, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z4.d }, p0/z, [x0, x12, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z5.d }, p0/z, [x0, x13, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z6.d }, p0/z, [x0, x14, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z7.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: fcvt z0.d, p0/m, z0.h
; VBITS_GE_256-NEXT: fcvt z1.d, p0/m, z1.h
; VBITS_GE_256-NEXT: fcvt z2.d, p0/m, z2.h
; VBITS_GE_256-NEXT: fcvt z3.d, p0/m, z3.h
; VBITS_GE_256-NEXT: fcvt z4.d, p0/m, z4.h
; VBITS_GE_256-NEXT: fcvt z5.d, p0/m, z5.h
; VBITS_GE_256-NEXT: fcvt z6.d, p0/m, z6.h
; VBITS_GE_256-NEXT: fcvt z7.d, p0/m, z7.h
; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x1, x14, lsl #3]
; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x1, x13, lsl #3]
; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x1, x12, lsl #3]
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x11, lsl #3]
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: fcvt_v32f16_v32f64:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1h { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: fcvt z0.d, p0/m, z0.h
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
define void @fcvt_v32f16_v32f64(<32 x half>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvt_v32f16_v32f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
; CHECK-NEXT: fcvt z0.d, p0/m, z0.h
; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; CHECK-NEXT: ret
%op1 = load <32 x half>, <32 x half>* %a
%res = fpext <32 x half> %op1 to <32 x double>
store <32 x double> %res, <32 x double>* %b
@@ -342,7 +213,7 @@ define void @fcvt_v32f16_v32f64(<32 x half>* %a, <32 x double>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
define void @fcvt_v1f32_v1f64(<1 x float>* %a, <1 x double>* %b) #0 {
define void @fcvt_v1f32_v1f64(<1 x float>* %a, <1 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v1f32_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0]
@@ -356,7 +227,7 @@ define void @fcvt_v1f32_v1f64(<1 x float>* %a, <1 x double>* %b) #0 {
}
; Don't use SVE for 128-bit vectors.
define void @fcvt_v2f32_v2f64(<2 x float>* %a, <2 x double>* %b) #0 {
define void @fcvt_v2f32_v2f64(<2 x float>* %a, <2 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v2f32_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
@@ -369,7 +240,7 @@ define void @fcvt_v2f32_v2f64(<2 x float>* %a, <2 x double>* %b) #0 {
ret void
}
define void @fcvt_v4f32_v4f64(<4 x float>* %a, <4 x double>* %b) #0 {
define void @fcvt_v4f32_v4f64(<4 x float>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v4f32_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -384,7 +255,6 @@ define void @fcvt_v4f32_v4f64(<4 x float>* %a, <4 x double>* %b) #0 {
}
define void @fcvt_v8f32_v8f64(<8 x float>* %a, <8 x double>* %b) #0 {
; Ensure sensible type legalisation.
; VBITS_GE_256-LABEL: fcvt_v8f32_v8f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
@@ -410,84 +280,28 @@ define void @fcvt_v8f32_v8f64(<8 x float>* %a, <8 x double>* %b) #0 {
ret void
}
define void @fcvt_v16f32_v16f64(<16 x float>* %a, <16 x double>* %b) #0 {
; VBITS_GE_256-LABEL: fcvt_v16f32_v16f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: mov x9, #12
; VBITS_GE_256-NEXT: mov x10, #8
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1w { z0.d }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.d }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.d }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: fcvt z0.d, p0/m, z0.s
; VBITS_GE_256-NEXT: fcvt z1.d, p0/m, z1.s
; VBITS_GE_256-NEXT: fcvt z2.d, p0/m, z2.s
; VBITS_GE_256-NEXT: fcvt z3.d, p0/m, z3.s
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: fcvt_v16f32_v16f64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1w { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: fcvt z0.d, p0/m, z0.s
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @fcvt_v16f32_v16f64(<16 x float>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvt_v16f32_v16f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
; CHECK-NEXT: fcvt z0.d, p0/m, z0.s
; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; CHECK-NEXT: ret
%op1 = load <16 x float>, <16 x float>* %a
%res = fpext <16 x float> %op1 to <16 x double>
store <16 x double> %res, <16 x double>* %b
ret void
}
define void @fcvt_v32f32_v32f64(<32 x float>* %a, <32 x double>* %b) #0 {
; VBITS_GE_256-LABEL: fcvt_v32f32_v32f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: mov x9, #12
; VBITS_GE_256-NEXT: mov x10, #8
; VBITS_GE_256-NEXT: mov x11, #20
; VBITS_GE_256-NEXT: mov x12, #16
; VBITS_GE_256-NEXT: mov x13, #28
; VBITS_GE_256-NEXT: mov x14, #24
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1w { z0.d }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.d }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.d }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.d }, p0/z, [x0, x11, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z4.d }, p0/z, [x0, x12, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z5.d }, p0/z, [x0, x13, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z6.d }, p0/z, [x0, x14, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z7.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: fcvt z0.d, p0/m, z0.s
; VBITS_GE_256-NEXT: fcvt z1.d, p0/m, z1.s
; VBITS_GE_256-NEXT: fcvt z2.d, p0/m, z2.s
; VBITS_GE_256-NEXT: fcvt z3.d, p0/m, z3.s
; VBITS_GE_256-NEXT: fcvt z4.d, p0/m, z4.s
; VBITS_GE_256-NEXT: fcvt z5.d, p0/m, z5.s
; VBITS_GE_256-NEXT: fcvt z6.d, p0/m, z6.s
; VBITS_GE_256-NEXT: fcvt z7.d, p0/m, z7.s
; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x1, x14, lsl #3]
; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x1, x13, lsl #3]
; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x1, x12, lsl #3]
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x11, lsl #3]
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: fcvt_v32f32_v32f64:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1w { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: fcvt z0.d, p0/m, z0.s
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
define void @fcvt_v32f32_v32f64(<32 x float>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvt_v32f32_v32f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
; CHECK-NEXT: fcvt z0.d, p0/m, z0.s
; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; CHECK-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
%res = fpext <32 x float> %op1 to <32 x double>
store <32 x double> %res, <32 x double>* %b
@@ -499,7 +313,7 @@ define void @fcvt_v32f32_v32f64(<32 x float>* %a, <32 x double>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
define void @fcvt_v2f32_v2f16(<2 x float>* %a, <2 x half>* %b) #0 {
define void @fcvt_v2f32_v2f16(<2 x float>* %a, <2 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v2f32_v2f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
@@ -513,7 +327,7 @@ define void @fcvt_v2f32_v2f16(<2 x float>* %a, <2 x half>* %b) #0 {
}
; Don't use SVE for 128-bit vectors.
define void @fcvt_v4f32_v4f16(<4 x float>* %a, <4 x half>* %b) #0 {
define void @fcvt_v4f32_v4f16(<4 x float>* %a, <4 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v4f32_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
@@ -526,7 +340,7 @@ define void @fcvt_v4f32_v4f16(<4 x float>* %a, <4 x half>* %b) #0 {
ret void
}
define void @fcvt_v8f32_v8f16(<8 x float>* %a, <8 x half>* %b) #0 {
define void @fcvt_v8f32_v8f16(<8 x float>* %a, <8 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v8f32_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -541,7 +355,18 @@ define void @fcvt_v8f32_v8f16(<8 x float>* %a, <8 x half>* %b) #0 {
}
define void @fcvt_v16f32_v16f16(<16 x float>* %a, <16 x half>* %b) #0 {
; Ensure sensible type legalisation
; VBITS_GE_256-LABEL: fcvt_v16f32_v16f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z0.s
; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z1.s
; VBITS_GE_256-NEXT: st1h { z0.s }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.s }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: fcvt_v16f32_v16f16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
@@ -555,90 +380,28 @@ define void @fcvt_v16f32_v16f16(<16 x float>* %a, <16 x half>* %b) #0 {
ret void
}
define void @fcvt_v32f32_v32f16(<32 x float>* %a, <32 x half>* %b) #0 {
; VBITS_GE_256-LABEL: fcvt_v32f32_v32f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #24
; VBITS_GE_256-NEXT: mov x9, #16
; VBITS_GE_256-NEXT: mov x10, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z1.s
; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z0.s
; VBITS_GE_256-NEXT: fcvt z2.h, p0/m, z2.s
; VBITS_GE_256-NEXT: fcvt z3.h, p0/m, z3.s
; VBITS_GE_256-NEXT: st1h { z1.s }, p0, [x1, x9, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.s }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z2.s }, p0, [x1, x10, lsl #1]
; VBITS_GE_256-NEXT: st1h { z3.s }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: fcvt_v32f32_v32f16:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: fcvt z0.h, p0/m, z0.s
; VBITS_GE_1024-NEXT: st1h { z0.s }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @fcvt_v32f32_v32f16(<32 x float>* %a, <32 x half>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvt_v32f32_v32f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: fcvt z0.h, p0/m, z0.s
; CHECK-NEXT: st1h { z0.s }, p0, [x1]
; CHECK-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
%res = fptrunc <32 x float> %op1 to <32 x half>
store <32 x half> %res, <32 x half>* %b
ret void
}
define void @fcvt_v64f32_v64f16(<64 x float>* %a, <64 x half>* %b) #0 {
; VBITS_GE_256-LABEL: fcvt_v64f32_v64f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x9, #56
; VBITS_GE_256-NEXT: mov x10, #48
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: mov x11, #24
; VBITS_GE_256-NEXT: mov x12, #16
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: mov x13, #40
; VBITS_GE_256-NEXT: mov x14, #32
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x12, lsl #2]
; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z1.s
; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z0.s
; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x13, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x14, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1h { z1.s }, p0, [x1, x10, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.s }, p0, [x1, x9, lsl #1]
; VBITS_GE_256-NEXT: movprfx z0, z5
; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z5.s
; VBITS_GE_256-NEXT: movprfx z1, z4
; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z4.s
; VBITS_GE_256-NEXT: st1h { z0.s }, p0, [x1, x14, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.s }, p0, [x1, x13, lsl #1]
; VBITS_GE_256-NEXT: movprfx z0, z6
; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z6.s
; VBITS_GE_256-NEXT: movprfx z1, z3
; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z3.s
; VBITS_GE_256-NEXT: st1h { z0.s }, p0, [x1, x12, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.s }, p0, [x1, x11, lsl #1]
; VBITS_GE_256-NEXT: movprfx z0, z2
; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z2.s
; VBITS_GE_256-NEXT: movprfx z1, z7
; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z7.s
; VBITS_GE_256-NEXT: st1h { z0.s }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.s }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: fcvt_v64f32_v64f16:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: fcvt z0.h, p0/m, z0.s
; VBITS_GE_2048-NEXT: st1h { z0.s }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a
%res = fptrunc <64 x float> %op1 to <64 x half>
store <64 x half> %res, <64 x half>* %b
@@ -650,7 +413,7 @@ define void @fcvt_v64f32_v64f16(<64 x float>* %a, <64 x half>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
define void @fcvt_v1f64_v1f16(<1 x double>* %a, <1 x half>* %b) #0 {
define void @fcvt_v1f64_v1f16(<1 x double>* %a, <1 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v1f64_v1f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
@@ -664,7 +427,7 @@ define void @fcvt_v1f64_v1f16(<1 x double>* %a, <1 x half>* %b) #0 {
}
; v2f16 is not legal for NEON, so use SVE
define void @fcvt_v2f64_v2f16(<2 x double>* %a, <2 x half>* %b) #0 {
define void @fcvt_v2f64_v2f16(<2 x double>* %a, <2 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v2f64_v2f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
@@ -680,7 +443,7 @@ define void @fcvt_v2f64_v2f16(<2 x double>* %a, <2 x half>* %b) #0 {
ret void
}
define void @fcvt_v4f64_v4f16(<4 x double>* %a, <4 x half>* %b) #0 {
define void @fcvt_v4f64_v4f16(<4 x double>* %a, <4 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v4f64_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -695,7 +458,6 @@ define void @fcvt_v4f64_v4f16(<4 x double>* %a, <4 x half>* %b) #0 {
}
define void @fcvt_v8f64_v8f16(<8 x double>* %a, <8 x half>* %b) #0 {
; Ensure sensible type legalisation
; VBITS_GE_256-LABEL: fcvt_v8f64_v8f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
@@ -726,70 +488,28 @@ define void @fcvt_v8f64_v8f16(<8 x double>* %a, <8 x half>* %b) #0 {
ret void
}
define void @fcvt_v16f64_v16f16(<16 x double>* %a, <16 x half>* %b) #0 {
; VBITS_GE_1024-LABEL: fcvt_v16f64_v16f16:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: fcvt z0.h, p0/m, z0.d
; VBITS_GE_1024-NEXT: st1h { z0.d }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @fcvt_v16f64_v16f16(<16 x double>* %a, <16 x half>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvt_v16f64_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: fcvt z0.h, p0/m, z0.d
; CHECK-NEXT: st1h { z0.d }, p0, [x1]
; CHECK-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%res = fptrunc <16 x double> %op1 to <16 x half>
store <16 x half> %res, <16 x half>* %b
ret void
}
define void @fcvt_v32f64_v32f16(<32 x double>* %a, <32 x half>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvt_v32f64_v32f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: fcvt z0.h, p0/m, z0.d
; CHECK-NEXT: st1h { z0.d }, p0, [x1]
; CHECK-NEXT: ret
define void @fcvt_v32f64_v32f16(<32 x double>* %a, <32 x half>* %b) #0 {
; VBITS_GE_256-LABEL: fcvt_v32f64_v32f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x9, #28
; VBITS_GE_256-NEXT: mov x10, #24
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: mov x11, #12
; VBITS_GE_256-NEXT: mov x12, #8
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: mov x13, #20
; VBITS_GE_256-NEXT: mov x14, #16
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x12, lsl #3]
; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z1.d
; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z0.d
; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x13, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x14, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1h { z1.d }, p0, [x1, x10, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.d }, p0, [x1, x9, lsl #1]
; VBITS_GE_256-NEXT: movprfx z0, z5
; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z5.d
; VBITS_GE_256-NEXT: movprfx z1, z4
; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z4.d
; VBITS_GE_256-NEXT: st1h { z0.d }, p0, [x1, x14, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.d }, p0, [x1, x13, lsl #1]
; VBITS_GE_256-NEXT: movprfx z0, z6
; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z6.d
; VBITS_GE_256-NEXT: movprfx z1, z3
; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z3.d
; VBITS_GE_256-NEXT: st1h { z0.d }, p0, [x1, x12, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.d }, p0, [x1, x11, lsl #1]
; VBITS_GE_256-NEXT: movprfx z0, z2
; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z2.d
; VBITS_GE_256-NEXT: movprfx z1, z7
; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z7.d
; VBITS_GE_256-NEXT: st1h { z0.d }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: fcvt_v32f64_v32f16:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: fcvt z0.h, p0/m, z0.d
; VBITS_GE_2048-NEXT: st1h { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%res = fptrunc <32 x double> %op1 to <32 x half>
store <32 x half> %res, <32 x half>* %b
@@ -801,7 +521,7 @@ define void @fcvt_v32f64_v32f16(<32 x double>* %a, <32 x half>* %b) #0 {
;
; Don't use SVE for 64-bit vectors.
define void @fcvt_v1f64_v1f32(<1 x double> %op1, <1 x float>* %b) #0 {
define void @fcvt_v1f64_v1f32(<1 x double> %op1, <1 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v1f64_v1f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
@@ -814,7 +534,7 @@ define void @fcvt_v1f64_v1f32(<1 x double> %op1, <1 x float>* %b) #0 {
}
; Don't use SVE for 128-bit vectors.
define void @fcvt_v2f64_v2f32(<2 x double> %op1, <2 x float>* %b) #0 {
define void @fcvt_v2f64_v2f32(<2 x double> %op1, <2 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v2f64_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fcvtn v0.2s, v0.2d
@@ -825,7 +545,7 @@ define void @fcvt_v2f64_v2f32(<2 x double> %op1, <2 x float>* %b) #0 {
ret void
}
define void @fcvt_v4f64_v4f32(<4 x double>* %a, <4 x float>* %b) #0 {
define void @fcvt_v4f64_v4f32(<4 x double>* %a, <4 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v4f64_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -840,7 +560,18 @@ define void @fcvt_v4f64_v4f32(<4 x double>* %a, <4 x float>* %b) #0 {
}
define void @fcvt_v8f64_v8f32(<8 x double>* %a, <8 x float>* %b) #0 {
; Ensure sensible type legalisation
; VBITS_GE_256-LABEL: fcvt_v8f64_v8f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: fcvt z0.s, p0/m, z0.d
; VBITS_GE_256-NEXT: fcvt z1.s, p0/m, z1.d
; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: fcvt_v8f64_v8f32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
@@ -854,90 +585,28 @@ define void @fcvt_v8f64_v8f32(<8 x double>* %a, <8 x float>* %b) #0 {
ret void
}
define void @fcvt_v16f64_v16f32(<16 x double>* %a, <16 x float>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvt_v16f64_v16f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: fcvt z0.s, p0/m, z0.d
; CHECK-NEXT: st1w { z0.d }, p0, [x1]
; CHECK-NEXT: ret
define void @fcvt_v16f64_v16f32(<16 x double>* %a, <16 x float>* %b) #0 {
; VBITS_GE_256-LABEL: fcvt_v16f64_v16f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #12
; VBITS_GE_256-NEXT: mov x9, #8
; VBITS_GE_256-NEXT: mov x10, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: fcvt z1.s, p0/m, z1.d
; VBITS_GE_256-NEXT: fcvt z0.s, p0/m, z0.d
; VBITS_GE_256-NEXT: fcvt z2.s, p0/m, z2.d
; VBITS_GE_256-NEXT: fcvt z3.s, p0/m, z3.d
; VBITS_GE_256-NEXT: st1w { z1.d }, p0, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z2.d }, p0, [x1, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z3.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: fcvt_v16f64_v16f32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: fcvt z0.s, p0/m, z0.d
; VBITS_GE_1024-NEXT: st1w { z0.d }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%res = fptrunc <16 x double> %op1 to <16 x float>
store <16 x float> %res, <16 x float>* %b
ret void
}
define void @fcvt_v32f64_v32f32(<32 x double>* %a, <32 x float>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvt_v32f64_v32f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: fcvt z0.s, p0/m, z0.d
; CHECK-NEXT: st1w { z0.d }, p0, [x1]
; CHECK-NEXT: ret
define void @fcvt_v32f64_v32f32(<32 x double>* %a, <32 x float>* %b) #0 {
; VBITS_GE_256-LABEL: fcvt_v32f64_v32f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x9, #28
; VBITS_GE_256-NEXT: mov x10, #24
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: mov x11, #12
; VBITS_GE_256-NEXT: mov x12, #8
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: mov x13, #20
; VBITS_GE_256-NEXT: mov x14, #16
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x12, lsl #3]
; VBITS_GE_256-NEXT: fcvt z1.s, p0/m, z1.d
; VBITS_GE_256-NEXT: fcvt z0.s, p0/m, z0.d
; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x13, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x14, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1w { z1.d }, p0, [x1, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: movprfx z0, z5
; VBITS_GE_256-NEXT: fcvt z0.s, p0/m, z5.d
; VBITS_GE_256-NEXT: movprfx z1, z4
; VBITS_GE_256-NEXT: fcvt z1.s, p0/m, z4.d
; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [x1, x14, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.d }, p0, [x1, x13, lsl #2]
; VBITS_GE_256-NEXT: movprfx z0, z6
; VBITS_GE_256-NEXT: fcvt z0.s, p0/m, z6.d
; VBITS_GE_256-NEXT: movprfx z1, z3
; VBITS_GE_256-NEXT: fcvt z1.s, p0/m, z3.d
; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [x1, x12, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.d }, p0, [x1, x11, lsl #2]
; VBITS_GE_256-NEXT: movprfx z0, z2
; VBITS_GE_256-NEXT: fcvt z0.s, p0/m, z2.d
; VBITS_GE_256-NEXT: movprfx z1, z7
; VBITS_GE_256-NEXT: fcvt z1.s, p0/m, z7.d
; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: fcvt_v32f64_v32f32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: fcvt z0.s, p0/m, z0.d
; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%res = fptrunc <32 x double> %op1 to <32 x float>
store <32 x float> %res, <32 x float>* %b


@@ -1,5 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s
; RUN: llc -O3 -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -O3 -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -O3 -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
@@ -8,7 +10,7 @@ target triple = "aarch64-unknown-linux-gnu"
;
; Don't use SVE for 64-bit vectors.
define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3) #0 {
define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: fmla v2.4h, v0.4h, v1.4h
@@ -20,7 +22,7 @@ define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3)
}
; Don't use SVE for 128-bit vectors.
define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) #0 {
define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: fmla v2.8h, v0.8h, v1.8h
@@ -31,7 +33,7 @@ define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3)
ret <8 x half> %res
}
define void @fma_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x half>* %c) #0 {
define void @fma_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x half>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -51,15 +53,31 @@ define void @fma_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x half>* %c) #0 {
}
define void @fma_v32f16(<32 x half>* %a, <32 x half>* %b, <32 x half>* %c) #0 {
; CHECK-LABEL: fma_v32f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl32
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: ld1h { z2.h }, p0/z, [x2]
; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z2.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
; VBITS_GE_256-LABEL: fma_v32f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x2, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x2]
; VBITS_GE_256-NEXT: fmad z0.h, p0/m, z2.h, z4.h
; VBITS_GE_256-NEXT: fmad z1.h, p0/m, z3.h, z5.h
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: fma_v32f16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT: ld1h { z2.h }, p0/z, [x2]
; VBITS_GE_512-NEXT: fmad z0.h, p0/m, z1.h, z2.h
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <32 x half>, <32 x half>* %a
%op2 = load <32 x half>, <32 x half>* %b
%op3 = load <32 x half>, <32 x half>* %c
@@ -69,7 +87,7 @@ define void @fma_v32f16(<32 x half>* %a, <32 x half>* %b, <32 x half>* %c) #0 {
ret void
}
define void @fma_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x half>* %c) #0 {
define void @fma_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x half>* %c) vscale_range(8,0) #0 {
; CHECK-LABEL: fma_v64f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
@@ -88,7 +106,7 @@ define void @fma_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x half>* %c) #0 {
ret void
}
define void @fma_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x half>* %c) #0 {
define void @fma_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x half>* %c) vscale_range(16,0) #0 {
; CHECK-LABEL: fma_v128f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
@@ -108,7 +126,7 @@ define void @fma_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x half>* %c) #
}
; Don't use SVE for 64-bit vectors.
define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3) #0 {
define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fmla v2.2s, v0.2s, v1.2s
@@ -120,7 +138,7 @@ define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %o
}
; Don't use SVE for 128-bit vectors.
define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3) #0 {
define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fmla v2.4s, v0.4s, v1.4s
@@ -131,7 +149,7 @@ define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %o
ret <4 x float> %res
}
define void @fma_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) #0 {
define void @fma_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -151,15 +169,31 @@ define void @fma_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) #0 {
}
define void @fma_v16f32(<16 x float>* %a, <16 x float>* %b, <16 x float>* %c) #0 {
; CHECK-LABEL: fma_v16f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl16
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: ld1w { z2.s }, p0/z, [x2]
; CHECK-NEXT: fmad z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
; VBITS_GE_256-LABEL: fma_v16f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x2]
; VBITS_GE_256-NEXT: fmad z0.s, p0/m, z2.s, z4.s
; VBITS_GE_256-NEXT: fmad z1.s, p0/m, z3.s, z5.s
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: fma_v16f32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: ld1w { z2.s }, p0/z, [x2]
; VBITS_GE_512-NEXT: fmad z0.s, p0/m, z1.s, z2.s
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <16 x float>, <16 x float>* %a
%op2 = load <16 x float>, <16 x float>* %b
%op3 = load <16 x float>, <16 x float>* %c
@@ -169,7 +203,7 @@ define void @fma_v16f32(<16 x float>* %a, <16 x float>* %b, <16 x float>* %c) #0
ret void
}
define void @fma_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x float>* %c) #0 {
define void @fma_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x float>* %c) vscale_range(8,0) #0 {
; CHECK-LABEL: fma_v32f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
@@ -188,7 +222,7 @@ define void @fma_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x float>* %c) #0
ret void
}
define void @fma_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x float>* %c) #0 {
define void @fma_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x float>* %c) vscale_range(16,0) #0 {
; CHECK-LABEL: fma_v64f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
@@ -208,7 +242,7 @@ define void @fma_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x float>* %c) #0
}
; Don't use SVE for 64-bit vectors.
define <1 x double> @fma_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double> %op3) #0 {
define <1 x double> @fma_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: fmadd d0, d0, d1, d2
@@ -219,7 +253,7 @@ define <1 x double> @fma_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double
}
; Don't use SVE for 128-bit vectors.
define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3) #0 {
define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: fmla v2.2d, v0.2d, v1.2d
@@ -230,7 +264,7 @@ define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double
ret <2 x double> %res
}
define void @fma_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x double>* %c) #0 {
define void @fma_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x double>* %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
@@ -250,15 +284,31 @@ define void @fma_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x double>* %c) #0
}
define void @fma_v8f64(<8 x double>* %a, <8 x double>* %b, <8 x double>* %c) #0 {
; CHECK-LABEL: fma_v8f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl8
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: ld1d { z2.d }, p0/z, [x2]
; CHECK-NEXT: fmad z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
; VBITS_GE_256-LABEL: fma_v8f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x2]
; VBITS_GE_256-NEXT: fmad z0.d, p0/m, z2.d, z4.d
; VBITS_GE_256-NEXT: fmad z1.d, p0/m, z3.d, z5.d
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: fma_v8f64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: ld1d { z2.d }, p0/z, [x2]
; VBITS_GE_512-NEXT: fmad z0.d, p0/m, z1.d, z2.d
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <8 x double>, <8 x double>* %a
%op2 = load <8 x double>, <8 x double>* %b
%op3 = load <8 x double>, <8 x double>* %c
@@ -268,7 +318,7 @@ define void @fma_v8f64(<8 x double>* %a, <8 x double>* %b, <8 x double>* %c) #0
ret void
}
define void @fma_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x double>* %c) #0 {
define void @fma_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x double>* %c) vscale_range(8,0) #0 {
; CHECK-LABEL: fma_v16f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
@@ -287,7 +337,7 @@ define void @fma_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x double>* %c)
ret void
}
define void @fma_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x double>* %c) #0 {
define void @fma_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x double>* %c) vscale_range(16,0) #0 {
; CHECK-LABEL: fma_v32f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32

File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -1,36 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
target triple = "aarch64-unknown-linux-gnu"
; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: ptrue
; Don't use SVE for 64-bit vectors.
define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, i1 %mask) vscale_range(2,0) #0 {
define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, i1 %mask) #0 {
; NO_SVE-LABEL: select_v4f16:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: tst w0, #0x1
; NO_SVE-NEXT: csetm w8, ne
; NO_SVE-NEXT: dup v2.4h, w8
; NO_SVE-NEXT: bif v0.8b, v1.8b, v2.8b
; NO_SVE-NEXT: ret
;
; CHECK-LABEL: select_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
@@ -43,15 +19,7 @@ define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, i1 %mask) #0 {
}
; Don't use SVE for 128-bit vectors.
define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) vscale_range(2,0) #0 {
define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) #0 {
; NO_SVE-LABEL: select_v8f16:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: tst w0, #0x1
; NO_SVE-NEXT: csetm w8, ne
; NO_SVE-NEXT: dup v2.8h, w8
; NO_SVE-NEXT: bif v0.16b, v1.16b, v2.16b
; NO_SVE-NEXT: ret
;
; CHECK-LABEL: select_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
@@ -63,21 +31,7 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) #0 {
ret <8 x half> %sel
}
define void @select_v16f16(<16 x half>* %a, <16 x half>* %b, i1 %mask) vscale_range(2,0) #0 {
define void @select_v16f16(<16 x half>* %a, <16 x half>* %b, i1 %mask) #0 {
; NO_SVE-LABEL: select_v16f16:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: tst w2, #0x1
; NO_SVE-NEXT: ldr q0, [x0]
; NO_SVE-NEXT: csetm w8, ne
; NO_SVE-NEXT: ldr q1, [x0, #16]
; NO_SVE-NEXT: ldr q2, [x1]
; NO_SVE-NEXT: ldr q3, [x1, #16]
; NO_SVE-NEXT: dup v4.8h, w8
; NO_SVE-NEXT: bif v0.16b, v2.16b, v4.16b
; NO_SVE-NEXT: bif v1.16b, v3.16b, v4.16b
; NO_SVE-NEXT: stp q0, q1, [x0]
; NO_SVE-NEXT: ret
;
; CHECK-LABEL: select_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w2, #0x1
@@ -99,26 +53,24 @@ define void @select_v16f16(<16 x half>* %a, <16 x half>* %b, i1 %mask) #0 {
}
define void @select_v32f16(<32 x half>* %a, <32 x half>* %b, i1 %mask) #0 {
; VBITS_GE_256-LABEL: select_v32f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: and w9, w2, #0x1
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ptrue p1.h
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: mov z4.h, w9
; VBITS_GE_256-NEXT: and z4.h, z4.h, #0x1
; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z4.h, #0
; VBITS_GE_256-NEXT: sel z1.h, p1, z1.h, z3.h
; VBITS_GE_256-NEXT: sel z0.h, p1, z0.h, z2.h
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
; NO_SVE-LABEL: select_v32f16:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: tst w2, #0x1
; NO_SVE-NEXT: ldr q0, [x0, #48]
; NO_SVE-NEXT: csetm w8, ne
; NO_SVE-NEXT: ldr q1, [x0]
; NO_SVE-NEXT: ldr q2, [x0, #16]
; NO_SVE-NEXT: ldr q3, [x0, #32]
; NO_SVE-NEXT: ldr q4, [x1, #48]
; NO_SVE-NEXT: dup v6.8h, w8
; NO_SVE-NEXT: ldr q5, [x1]
; NO_SVE-NEXT: ldr q7, [x1, #16]
; NO_SVE-NEXT: ldr q16, [x1, #32]
; NO_SVE-NEXT: bif v1.16b, v5.16b, v6.16b
; NO_SVE-NEXT: bif v2.16b, v7.16b, v6.16b
; NO_SVE-NEXT: bif v0.16b, v4.16b, v6.16b
; NO_SVE-NEXT: bif v3.16b, v16.16b, v6.16b
; NO_SVE-NEXT: stp q1, q2, [x0]
; NO_SVE-NEXT: stp q3, q0, [x0, #32]
; NO_SVE-NEXT: ret
;
; VBITS_GE_512-LABEL: select_v32f16:
; VBITS_GE_512: // %bb.0:
@@ -140,58 +92,20 @@ define void @select_v32f16(<32 x half>* %a, <32 x half>* %b, i1 %mask) #0 {
ret void
}
define void @select_v64f16(<64 x half>* %a, <64 x half>* %b, i1 %mask) vscale_range(8,0) #0 {
; CHECK-LABEL: select_v64f16:
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w2, #0x1
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: ptrue p1.h
; CHECK-NEXT: mov z2.h, w8
; CHECK-NEXT: and z2.h, z2.h, #0x1
; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0
; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
define void @select_v64f16(<64 x half>* %a, <64 x half>* %b, i1 %mask) #0 {
; NO_SVE-LABEL: select_v64f16:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: tst w2, #0x1
; NO_SVE-NEXT: ldr q0, [x0, #16]
; NO_SVE-NEXT: csetm w8, ne
; NO_SVE-NEXT: ldr q1, [x0]
; NO_SVE-NEXT: ldr q2, [x0, #48]
; NO_SVE-NEXT: ldr q3, [x0, #32]
; NO_SVE-NEXT: ldr q4, [x0, #80]
; NO_SVE-NEXT: dup v21.8h, w8
; NO_SVE-NEXT: ldr q5, [x0, #64]
; NO_SVE-NEXT: ldr q6, [x0, #112]
; NO_SVE-NEXT: ldr q7, [x0, #96]
; NO_SVE-NEXT: ldr q16, [x1, #16]
; NO_SVE-NEXT: ldr q17, [x1]
; NO_SVE-NEXT: ldr q18, [x1, #48]
; NO_SVE-NEXT: ldr q19, [x1, #32]
; NO_SVE-NEXT: bif v0.16b, v16.16b, v21.16b
; NO_SVE-NEXT: ldr q20, [x1, #80]
; NO_SVE-NEXT: bif v1.16b, v17.16b, v21.16b
; NO_SVE-NEXT: ldr q16, [x1, #64]
; NO_SVE-NEXT: bif v2.16b, v18.16b, v21.16b
; NO_SVE-NEXT: ldr q17, [x1, #112]
; NO_SVE-NEXT: bif v3.16b, v19.16b, v21.16b
; NO_SVE-NEXT: ldr q18, [x1, #96]
; NO_SVE-NEXT: bif v4.16b, v20.16b, v21.16b
; NO_SVE-NEXT: stp q1, q0, [x0]
; NO_SVE-NEXT: mov v0.16b, v21.16b
; NO_SVE-NEXT: mov v1.16b, v21.16b
; NO_SVE-NEXT: stp q3, q2, [x0, #32]
; NO_SVE-NEXT: mov v2.16b, v21.16b
; NO_SVE-NEXT: bsl v0.16b, v5.16b, v16.16b
; NO_SVE-NEXT: bsl v1.16b, v6.16b, v17.16b
; NO_SVE-NEXT: bsl v2.16b, v7.16b, v18.16b
; NO_SVE-NEXT: stp q0, q4, [x0, #64]
; NO_SVE-NEXT: stp q2, q1, [x0, #96]
; NO_SVE-NEXT: ret
;
; VBITS_GE_1024-LABEL: select_v64f16:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: and w8, w2, #0x1
; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_1024-NEXT: ptrue p1.h
; VBITS_GE_1024-NEXT: mov z2.h, w8
; VBITS_GE_1024-NEXT: and z2.h, z2.h, #0x1
; VBITS_GE_1024-NEXT: cmpne p1.h, p1/z, z2.h, #0
; VBITS_GE_1024-NEXT: sel z0.h, p1, z0.h, z1.h
; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
%op1 = load volatile <64 x half>, <64 x half>* %a
%op2 = load volatile <64 x half>, <64 x half>* %b
%sel = select i1 %mask, <64 x half> %op1, <64 x half> %op2
@@ -199,103 +113,20 @@ define void @select_v64f16(<64 x half>* %a, <64 x half>* %b, i1 %mask) #0 {
ret void
}
define void @select_v128f16(<128 x half>* %a, <128 x half>* %b, i1 %mask) vscale_range(16,0) #0 {
; CHECK-LABEL: select_v128f16:
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w2, #0x1
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: ptrue p1.h
; CHECK-NEXT: mov z2.h, w8
; CHECK-NEXT: and z2.h, z2.h, #0x1
; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0
; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
define void @select_v128f16(<128 x half>* %a, <128 x half>* %b, i1 %mask) #0 {
; NO_SVE-LABEL: select_v128f16:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: stp d11, d10, [sp, #-32]! // 16-byte Folded Spill
; NO_SVE-NEXT: .cfi_def_cfa_offset 32
; NO_SVE-NEXT: stp d9, d8, [sp, #16] // 16-byte Folded Spill
; NO_SVE-NEXT: .cfi_offset b8, -8
; NO_SVE-NEXT: .cfi_offset b9, -16
; NO_SVE-NEXT: .cfi_offset b10, -24
; NO_SVE-NEXT: .cfi_offset b11, -32
; NO_SVE-NEXT: tst w2, #0x1
; NO_SVE-NEXT: ldr q0, [x0, #240]
; NO_SVE-NEXT: csetm w8, ne
; NO_SVE-NEXT: ldr q1, [x0, #224]
; NO_SVE-NEXT: ldr q2, [x0, #208]
; NO_SVE-NEXT: ldr q3, [x0, #192]
; NO_SVE-NEXT: ldr q4, [x0, #176]
; NO_SVE-NEXT: dup v8.8h, w8
; NO_SVE-NEXT: ldr q5, [x0, #160]
; NO_SVE-NEXT: ldr q6, [x0, #144]
; NO_SVE-NEXT: ldr q7, [x0, #128]
; NO_SVE-NEXT: ldr q16, [x0, #112]
; NO_SVE-NEXT: ldr q17, [x0, #96]
; NO_SVE-NEXT: ldr q18, [x0, #80]
; NO_SVE-NEXT: ldr q19, [x0, #64]
; NO_SVE-NEXT: ldr q20, [x0, #48]
; NO_SVE-NEXT: ldr q21, [x0, #32]
; NO_SVE-NEXT: ldr q22, [x0, #16]
; NO_SVE-NEXT: ldr q23, [x0]
; NO_SVE-NEXT: ldr q24, [x1, #240]
; NO_SVE-NEXT: ldr q25, [x1, #224]
; NO_SVE-NEXT: ldr q26, [x1, #208]
; NO_SVE-NEXT: ldr q27, [x1, #192]
; NO_SVE-NEXT: bif v0.16b, v24.16b, v8.16b
; NO_SVE-NEXT: ldr q28, [x1, #176]
; NO_SVE-NEXT: bif v1.16b, v25.16b, v8.16b
; NO_SVE-NEXT: ldr q29, [x1, #160]
; NO_SVE-NEXT: bif v2.16b, v26.16b, v8.16b
; NO_SVE-NEXT: ldr q30, [x1, #144]
; NO_SVE-NEXT: bif v3.16b, v27.16b, v8.16b
; NO_SVE-NEXT: ldr q31, [x1, #128]
; NO_SVE-NEXT: ldr q9, [x1, #112]
; NO_SVE-NEXT: ldr q10, [x1, #96]
; NO_SVE-NEXT: bif v4.16b, v28.16b, v8.16b
; NO_SVE-NEXT: ldr q28, [x1, #80]
; NO_SVE-NEXT: ldr q24, [x1, #64]
; NO_SVE-NEXT: ldr q25, [x1, #48]
; NO_SVE-NEXT: ldr q26, [x1, #32]
; NO_SVE-NEXT: ldr q27, [x1, #16]
; NO_SVE-NEXT: ldr q11, [x1]
; NO_SVE-NEXT: stp q3, q2, [x0, #192]
; NO_SVE-NEXT: stp q1, q0, [x0, #224]
; NO_SVE-NEXT: mov v0.16b, v8.16b
; NO_SVE-NEXT: mov v1.16b, v8.16b
; NO_SVE-NEXT: mov v2.16b, v8.16b
; NO_SVE-NEXT: bsl v0.16b, v5.16b, v29.16b
; NO_SVE-NEXT: bsl v1.16b, v6.16b, v30.16b
; NO_SVE-NEXT: bsl v2.16b, v7.16b, v31.16b
; NO_SVE-NEXT: mov v3.16b, v8.16b
; NO_SVE-NEXT: stp q0, q4, [x0, #160]
; NO_SVE-NEXT: mov v4.16b, v8.16b
; NO_SVE-NEXT: mov v0.16b, v8.16b
; NO_SVE-NEXT: stp q2, q1, [x0, #128]
; NO_SVE-NEXT: mov v1.16b, v8.16b
; NO_SVE-NEXT: bsl v3.16b, v16.16b, v9.16b
; NO_SVE-NEXT: bsl v4.16b, v17.16b, v10.16b
; NO_SVE-NEXT: bsl v0.16b, v18.16b, v28.16b
; NO_SVE-NEXT: bsl v1.16b, v19.16b, v24.16b
; NO_SVE-NEXT: mov v2.16b, v8.16b
; NO_SVE-NEXT: stp q4, q3, [x0, #96]
; NO_SVE-NEXT: mov v3.16b, v8.16b
; NO_SVE-NEXT: mov v4.16b, v8.16b
; NO_SVE-NEXT: stp q1, q0, [x0, #64]
; NO_SVE-NEXT: mov v0.16b, v8.16b
; NO_SVE-NEXT: bsl v2.16b, v20.16b, v25.16b
; NO_SVE-NEXT: bsl v3.16b, v21.16b, v26.16b
; NO_SVE-NEXT: bsl v4.16b, v22.16b, v27.16b
; NO_SVE-NEXT: bsl v0.16b, v23.16b, v11.16b
; NO_SVE-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload
; NO_SVE-NEXT: stp q3, q2, [x0, #32]
; NO_SVE-NEXT: stp q0, q4, [x0]
; NO_SVE-NEXT: ldp d11, d10, [sp], #32 // 16-byte Folded Reload
; NO_SVE-NEXT: ret
;
; VBITS_GE_2048-LABEL: select_v128f16:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: and w8, w2, #0x1
; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_2048-NEXT: ptrue p1.h
; VBITS_GE_2048-NEXT: mov z2.h, w8
; VBITS_GE_2048-NEXT: and z2.h, z2.h, #0x1
; VBITS_GE_2048-NEXT: cmpne p1.h, p1/z, z2.h, #0
; VBITS_GE_2048-NEXT: sel z0.h, p1, z0.h, z1.h
; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
%op1 = load volatile <128 x half>, <128 x half>* %a
%op2 = load volatile <128 x half>, <128 x half>* %b
%sel = select i1 %mask, <128 x half> %op1, <128 x half> %op2
@@ -304,15 +135,7 @@ define void @select_v128f16(<128 x half>* %a, <128 x half>* %b, i1 %mask) #0 {
}
; Don't use SVE for 64-bit vectors.
define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, i1 %mask) vscale_range(2,0) #0 {
define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, i1 %mask) #0 {
; NO_SVE-LABEL: select_v2f32:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: tst w0, #0x1
; NO_SVE-NEXT: csetm w8, ne
; NO_SVE-NEXT: dup v2.2s, w8
; NO_SVE-NEXT: bif v0.8b, v1.8b, v2.8b
; NO_SVE-NEXT: ret
;
; CHECK-LABEL: select_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
@@ -325,15 +148,7 @@ define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, i1 %mask) #
}
; Don't use SVE for 128-bit vectors.
define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) vscale_range(2,0) #0 {
define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) #0 {
; NO_SVE-LABEL: select_v4f32:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: tst w0, #0x1
; NO_SVE-NEXT: csetm w8, ne
; NO_SVE-NEXT: dup v2.4s, w8
; NO_SVE-NEXT: bif v0.16b, v1.16b, v2.16b
; NO_SVE-NEXT: ret
;
; CHECK-LABEL: select_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
@@ -345,21 +160,7 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) #
ret <4 x float> %sel
}
define void @select_v8f32(<8 x float>* %a, <8 x float>* %b, i1 %mask) vscale_range(2,0) #0 {
define void @select_v8f32(<8 x float>* %a, <8 x float>* %b, i1 %mask) #0 {
; NO_SVE-LABEL: select_v8f32:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: tst w2, #0x1
; NO_SVE-NEXT: ldr q0, [x0]
; NO_SVE-NEXT: csetm w8, ne
; NO_SVE-NEXT: ldr q1, [x0, #16]
; NO_SVE-NEXT: ldr q2, [x1]
; NO_SVE-NEXT: ldr q3, [x1, #16]
; NO_SVE-NEXT: dup v4.4s, w8
; NO_SVE-NEXT: bif v0.16b, v2.16b, v4.16b
; NO_SVE-NEXT: bif v1.16b, v3.16b, v4.16b
; NO_SVE-NEXT: stp q0, q1, [x0]
; NO_SVE-NEXT: ret
;
; CHECK-LABEL: select_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w2, #0x1
@@ -381,26 +182,24 @@ define void @select_v8f32(<8 x float>* %a, <8 x float>* %b, i1 %mask) #0 {
}
define void @select_v16f32(<16 x float>* %a, <16 x float>* %b, i1 %mask) #0 {
; VBITS_GE_256-LABEL: select_v16f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: and w9, w2, #0x1
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ptrue p1.s
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: mov z4.s, w9
; VBITS_GE_256-NEXT: and z4.s, z4.s, #0x1
; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z4.s, #0
; VBITS_GE_256-NEXT: sel z1.s, p1, z1.s, z3.s
; VBITS_GE_256-NEXT: sel z0.s, p1, z0.s, z2.s
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
; NO_SVE-LABEL: select_v16f32:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: tst w2, #0x1
; NO_SVE-NEXT: ldr q0, [x0, #48]
; NO_SVE-NEXT: csetm w8, ne
; NO_SVE-NEXT: ldr q1, [x0]
; NO_SVE-NEXT: ldr q2, [x0, #16]
; NO_SVE-NEXT: ldr q3, [x0, #32]
; NO_SVE-NEXT: ldr q4, [x1, #48]
; NO_SVE-NEXT: dup v6.4s, w8
; NO_SVE-NEXT: ldr q5, [x1]
; NO_SVE-NEXT: ldr q7, [x1, #16]
; NO_SVE-NEXT: ldr q16, [x1, #32]
; NO_SVE-NEXT: bif v1.16b, v5.16b, v6.16b
; NO_SVE-NEXT: bif v2.16b, v7.16b, v6.16b
; NO_SVE-NEXT: bif v0.16b, v4.16b, v6.16b
; NO_SVE-NEXT: bif v3.16b, v16.16b, v6.16b
; NO_SVE-NEXT: stp q1, q2, [x0]
; NO_SVE-NEXT: stp q3, q0, [x0, #32]
; NO_SVE-NEXT: ret
;
; VBITS_GE_512-LABEL: select_v16f32:
; VBITS_GE_512: // %bb.0:
@@ -422,58 +221,20 @@ define void @select_v16f32(<16 x float>* %a, <16 x float>* %b, i1 %mask) #0 {
ret void
}
define void @select_v32f32(<32 x float>* %a, <32 x float>* %b, i1 %mask) vscale_range(8,0) #0 {
; CHECK-LABEL: select_v32f32:
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w2, #0x1
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: ptrue p1.s
; CHECK-NEXT: mov z2.s, w8
; CHECK-NEXT: and z2.s, z2.s, #0x1
; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0
; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
define void @select_v32f32(<32 x float>* %a, <32 x float>* %b, i1 %mask) #0 {
; NO_SVE-LABEL: select_v32f32:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: tst w2, #0x1
; NO_SVE-NEXT: ldr q0, [x0, #16]
; NO_SVE-NEXT: csetm w8, ne
; NO_SVE-NEXT: ldr q1, [x0]
; NO_SVE-NEXT: ldr q2, [x0, #48]
; NO_SVE-NEXT: ldr q3, [x0, #32]
; NO_SVE-NEXT: ldr q4, [x0, #80]
; NO_SVE-NEXT: dup v21.4s, w8
; NO_SVE-NEXT: ldr q5, [x0, #64]
; NO_SVE-NEXT: ldr q6, [x0, #112]
; NO_SVE-NEXT: ldr q7, [x0, #96]
; NO_SVE-NEXT: ldr q16, [x1, #16]
; NO_SVE-NEXT: ldr q17, [x1]
; NO_SVE-NEXT: ldr q18, [x1, #48]
; NO_SVE-NEXT: ldr q19, [x1, #32]
; NO_SVE-NEXT: bif v0.16b, v16.16b, v21.16b
; NO_SVE-NEXT: ldr q20, [x1, #80]
; NO_SVE-NEXT: bif v1.16b, v17.16b, v21.16b
; NO_SVE-NEXT: ldr q16, [x1, #64]
; NO_SVE-NEXT: bif v2.16b, v18.16b, v21.16b
; NO_SVE-NEXT: ldr q17, [x1, #112]
; NO_SVE-NEXT: bif v3.16b, v19.16b, v21.16b
; NO_SVE-NEXT: ldr q18, [x1, #96]
; NO_SVE-NEXT: bif v4.16b, v20.16b, v21.16b
; NO_SVE-NEXT: stp q1, q0, [x0]
; NO_SVE-NEXT: mov v0.16b, v21.16b
; NO_SVE-NEXT: mov v1.16b, v21.16b
; NO_SVE-NEXT: stp q3, q2, [x0, #32]
; NO_SVE-NEXT: mov v2.16b, v21.16b
; NO_SVE-NEXT: bsl v0.16b, v5.16b, v16.16b
; NO_SVE-NEXT: bsl v1.16b, v6.16b, v17.16b
; NO_SVE-NEXT: bsl v2.16b, v7.16b, v18.16b
; NO_SVE-NEXT: stp q0, q4, [x0, #64]
; NO_SVE-NEXT: stp q2, q1, [x0, #96]
; NO_SVE-NEXT: ret
;
; VBITS_GE_1024-LABEL: select_v32f32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: and w8, w2, #0x1
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_1024-NEXT: ptrue p1.s
; VBITS_GE_1024-NEXT: mov z2.s, w8
; VBITS_GE_1024-NEXT: and z2.s, z2.s, #0x1
; VBITS_GE_1024-NEXT: cmpne p1.s, p1/z, z2.s, #0
; VBITS_GE_1024-NEXT: sel z0.s, p1, z0.s, z1.s
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
%op1 = load volatile <32 x float>, <32 x float>* %a
%op2 = load volatile <32 x float>, <32 x float>* %b
%sel = select i1 %mask, <32 x float> %op1, <32 x float> %op2
@@ -481,103 +242,20 @@ define void @select_v32f32(<32 x float>* %a, <32 x float>* %b, i1 %mask) #0 {
ret void
}
define void @select_v64f32(<64 x float>* %a, <64 x float>* %b, i1 %mask) vscale_range(16,0) #0 {
; CHECK-LABEL: select_v64f32:
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w2, #0x1
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: ptrue p1.s
; CHECK-NEXT: mov z2.s, w8
; CHECK-NEXT: and z2.s, z2.s, #0x1
; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0
; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
define void @select_v64f32(<64 x float>* %a, <64 x float>* %b, i1 %mask) #0 {
; NO_SVE-LABEL: select_v64f32:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: stp d11, d10, [sp, #-32]! // 16-byte Folded Spill
; NO_SVE-NEXT: .cfi_def_cfa_offset 32
; NO_SVE-NEXT: stp d9, d8, [sp, #16] // 16-byte Folded Spill
; NO_SVE-NEXT: .cfi_offset b8, -8
; NO_SVE-NEXT: .cfi_offset b9, -16
; NO_SVE-NEXT: .cfi_offset b10, -24
; NO_SVE-NEXT: .cfi_offset b11, -32
; NO_SVE-NEXT: tst w2, #0x1
; NO_SVE-NEXT: ldr q0, [x0, #240]
; NO_SVE-NEXT: csetm w8, ne
; NO_SVE-NEXT: ldr q1, [x0, #224]
; NO_SVE-NEXT: ldr q2, [x0, #208]
; NO_SVE-NEXT: ldr q3, [x0, #192]
; NO_SVE-NEXT: ldr q4, [x0, #176]
; NO_SVE-NEXT: dup v8.4s, w8
; NO_SVE-NEXT: ldr q5, [x0, #160]
; NO_SVE-NEXT: ldr q6, [x0, #144]
; NO_SVE-NEXT: ldr q7, [x0, #128]
; NO_SVE-NEXT: ldr q16, [x0, #112]
; NO_SVE-NEXT: ldr q17, [x0, #96]
; NO_SVE-NEXT: ldr q18, [x0, #80]
; NO_SVE-NEXT: ldr q19, [x0, #64]
; NO_SVE-NEXT: ldr q20, [x0, #48]
; NO_SVE-NEXT: ldr q21, [x0, #32]
; NO_SVE-NEXT: ldr q22, [x0, #16]
; NO_SVE-NEXT: ldr q23, [x0]
; NO_SVE-NEXT: ldr q24, [x1, #240]
; NO_SVE-NEXT: ldr q25, [x1, #224]
; NO_SVE-NEXT: ldr q26, [x1, #208]
; NO_SVE-NEXT: ldr q27, [x1, #192]
; NO_SVE-NEXT: bif v0.16b, v24.16b, v8.16b
; NO_SVE-NEXT: ldr q28, [x1, #176]
; NO_SVE-NEXT: bif v1.16b, v25.16b, v8.16b
; NO_SVE-NEXT: ldr q29, [x1, #160]
; NO_SVE-NEXT: bif v2.16b, v26.16b, v8.16b
; NO_SVE-NEXT: ldr q30, [x1, #144]
; NO_SVE-NEXT: bif v3.16b, v27.16b, v8.16b
; NO_SVE-NEXT: ldr q31, [x1, #128]
; NO_SVE-NEXT: ldr q9, [x1, #112]
; NO_SVE-NEXT: ldr q10, [x1, #96]
; NO_SVE-NEXT: bif v4.16b, v28.16b, v8.16b
; NO_SVE-NEXT: ldr q28, [x1, #80]
; NO_SVE-NEXT: ldr q24, [x1, #64]
; NO_SVE-NEXT: ldr q25, [x1, #48]
; NO_SVE-NEXT: ldr q26, [x1, #32]
; NO_SVE-NEXT: ldr q27, [x1, #16]
; NO_SVE-NEXT: ldr q11, [x1]
; NO_SVE-NEXT: stp q3, q2, [x0, #192]
; NO_SVE-NEXT: stp q1, q0, [x0, #224]
; NO_SVE-NEXT: mov v0.16b, v8.16b
; NO_SVE-NEXT: mov v1.16b, v8.16b
; NO_SVE-NEXT: mov v2.16b, v8.16b
; NO_SVE-NEXT: bsl v0.16b, v5.16b, v29.16b
; NO_SVE-NEXT: bsl v1.16b, v6.16b, v30.16b
; NO_SVE-NEXT: bsl v2.16b, v7.16b, v31.16b
; NO_SVE-NEXT: mov v3.16b, v8.16b
; NO_SVE-NEXT: stp q0, q4, [x0, #160]
; NO_SVE-NEXT: mov v4.16b, v8.16b
; NO_SVE-NEXT: mov v0.16b, v8.16b
; NO_SVE-NEXT: stp q2, q1, [x0, #128]
; NO_SVE-NEXT: mov v1.16b, v8.16b
; NO_SVE-NEXT: bsl v3.16b, v16.16b, v9.16b
; NO_SVE-NEXT: bsl v4.16b, v17.16b, v10.16b
; NO_SVE-NEXT: bsl v0.16b, v18.16b, v28.16b
; NO_SVE-NEXT: bsl v1.16b, v19.16b, v24.16b
; NO_SVE-NEXT: mov v2.16b, v8.16b
; NO_SVE-NEXT: stp q4, q3, [x0, #96]
; NO_SVE-NEXT: mov v3.16b, v8.16b
; NO_SVE-NEXT: mov v4.16b, v8.16b
; NO_SVE-NEXT: stp q1, q0, [x0, #64]
; NO_SVE-NEXT: mov v0.16b, v8.16b
; NO_SVE-NEXT: bsl v2.16b, v20.16b, v25.16b
; NO_SVE-NEXT: bsl v3.16b, v21.16b, v26.16b
; NO_SVE-NEXT: bsl v4.16b, v22.16b, v27.16b
; NO_SVE-NEXT: bsl v0.16b, v23.16b, v11.16b
; NO_SVE-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload
; NO_SVE-NEXT: stp q3, q2, [x0, #32]
; NO_SVE-NEXT: stp q0, q4, [x0]
; NO_SVE-NEXT: ldp d11, d10, [sp], #32 // 16-byte Folded Reload
; NO_SVE-NEXT: ret
;
; VBITS_GE_2048-LABEL: select_v64f32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: and w8, w2, #0x1
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_2048-NEXT: ptrue p1.s
; VBITS_GE_2048-NEXT: mov z2.s, w8
; VBITS_GE_2048-NEXT: and z2.s, z2.s, #0x1
; VBITS_GE_2048-NEXT: cmpne p1.s, p1/z, z2.s, #0
; VBITS_GE_2048-NEXT: sel z0.s, p1, z0.s, z1.s
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
%op1 = load volatile <64 x float>, <64 x float>* %a
%op2 = load volatile <64 x float>, <64 x float>* %b
%sel = select i1 %mask, <64 x float> %op1, <64 x float> %op2
@@ -586,15 +264,7 @@ define void @select_v64f32(<64 x float>* %a, <64 x float>* %b, i1 %mask) #0 {
}
; Don't use SVE for 64-bit vectors.
define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, i1 %mask) vscale_range(2,0) #0 {
define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, i1 %mask) #0 {
; NO_SVE-LABEL: select_v1f64:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: tst w0, #0x1
; NO_SVE-NEXT: csetm x8, ne
; NO_SVE-NEXT: fmov d2, x8
; NO_SVE-NEXT: bif v0.8b, v1.8b, v2.8b
; NO_SVE-NEXT: ret
;
; CHECK-LABEL: select_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
@@ -607,15 +277,7 @@ define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, i1 %mask
}
; Don't use SVE for 128-bit vectors.
define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask) vscale_range(2,0) #0 {
define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask) #0 {
; NO_SVE-LABEL: select_v2f64:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: tst w0, #0x1
; NO_SVE-NEXT: csetm x8, ne
; NO_SVE-NEXT: dup v2.2d, x8
; NO_SVE-NEXT: bif v0.16b, v1.16b, v2.16b
; NO_SVE-NEXT: ret
;
; CHECK-LABEL: select_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
@@ -627,21 +289,7 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask
ret <2 x double> %sel
}
define void @select_v4f64(<4 x double>* %a, <4 x double>* %b, i1 %mask) vscale_range(2,0) #0 {
define void @select_v4f64(<4 x double>* %a, <4 x double>* %b, i1 %mask) #0 {
; NO_SVE-LABEL: select_v4f64:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: tst w2, #0x1
; NO_SVE-NEXT: ldr q0, [x0]
; NO_SVE-NEXT: csetm x8, ne
; NO_SVE-NEXT: ldr q1, [x0, #16]
; NO_SVE-NEXT: ldr q2, [x1]
; NO_SVE-NEXT: ldr q3, [x1, #16]
; NO_SVE-NEXT: dup v4.2d, x8
; NO_SVE-NEXT: bif v0.16b, v2.16b, v4.16b
; NO_SVE-NEXT: bif v1.16b, v3.16b, v4.16b
; NO_SVE-NEXT: stp q0, q1, [x0]
; NO_SVE-NEXT: ret
;
; CHECK-LABEL: select_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w2, #0x1
@@ -663,26 +311,24 @@ define void @select_v4f64(<4 x double>* %a, <4 x double>* %b, i1 %mask) #0 {
}
define void @select_v8f64(<8 x double>* %a, <8 x double>* %b, i1 %mask) #0 {
; VBITS_GE_256-LABEL: select_v8f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: and w9, w2, #0x1
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ptrue p1.d
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: mov z4.d, x9
; VBITS_GE_256-NEXT: and z4.d, z4.d, #0x1
; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z4.d, #0
; VBITS_GE_256-NEXT: sel z1.d, p1, z1.d, z3.d
; VBITS_GE_256-NEXT: sel z0.d, p1, z0.d, z2.d
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
; NO_SVE-LABEL: select_v8f64:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: tst w2, #0x1
; NO_SVE-NEXT: ldr q0, [x0, #48]
; NO_SVE-NEXT: csetm x8, ne
; NO_SVE-NEXT: ldr q1, [x0]
; NO_SVE-NEXT: ldr q2, [x0, #16]
; NO_SVE-NEXT: ldr q3, [x0, #32]
; NO_SVE-NEXT: ldr q4, [x1, #48]
; NO_SVE-NEXT: dup v6.2d, x8
; NO_SVE-NEXT: ldr q5, [x1]
; NO_SVE-NEXT: ldr q7, [x1, #16]
; NO_SVE-NEXT: ldr q16, [x1, #32]
; NO_SVE-NEXT: bif v1.16b, v5.16b, v6.16b
; NO_SVE-NEXT: bif v2.16b, v7.16b, v6.16b
; NO_SVE-NEXT: bif v0.16b, v4.16b, v6.16b
; NO_SVE-NEXT: bif v3.16b, v16.16b, v6.16b
; NO_SVE-NEXT: stp q1, q2, [x0]
; NO_SVE-NEXT: stp q3, q0, [x0, #32]
; NO_SVE-NEXT: ret
;
; VBITS_GE_512-LABEL: select_v8f64:
; VBITS_GE_512: // %bb.0:
@@ -704,58 +350,20 @@ define void @select_v8f64(<8 x double>* %a, <8 x double>* %b, i1 %mask) #0 {
ret void
}
define void @select_v16f64(<16 x double>* %a, <16 x double>* %b, i1 %mask) vscale_range(8,0) #0 {
; CHECK-LABEL: select_v16f64:
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w2, #0x1
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: mov z2.d, x8
; CHECK-NEXT: and z2.d, z2.d, #0x1
; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
define void @select_v16f64(<16 x double>* %a, <16 x double>* %b, i1 %mask) #0 {
; NO_SVE-LABEL: select_v16f64:
; NO_SVE: // %bb.0:
; NO_SVE-NEXT: tst w2, #0x1
; NO_SVE-NEXT: ldr q0, [x0, #16]
; NO_SVE-NEXT: csetm x8, ne
; NO_SVE-NEXT: ldr q1, [x0]
; NO_SVE-NEXT: ldr q2, [x0, #48]
; NO_SVE-NEXT: ldr q3, [x0, #32]
; NO_SVE-NEXT: ldr q4, [x0, #80]
; NO_SVE-NEXT: dup v21.2d, x8
; NO_SVE-NEXT: ldr q5, [x0, #64]
; NO_SVE-NEXT: ldr q6, [x0, #112]
; NO_SVE-NEXT: ldr q7, [x0, #96]
; NO_SVE-NEXT: ldr q16, [x1, #16]
; NO_SVE-NEXT: ldr q17, [x1]
; NO_SVE-NEXT: ldr q18, [x1, #48]
; NO_SVE-NEXT: ldr q19, [x1, #32]
; NO_SVE-NEXT: bif v0.16b, v16.16b, v21.16b
; NO_SVE-NEXT: ldr q20, [x1, #80]
; NO_SVE-NEXT: bif v1.16b, v17.16b, v21.16b
; NO_SVE-NEXT: ldr q16, [x1, #64]
; NO_SVE-NEXT: bif v2.16b, v18.16b, v21.16b
; NO_SVE-NEXT: ldr q17, [x1, #112]
; NO_SVE-NEXT: bif v3.16b, v19.16b, v21.16b
; NO_SVE-NEXT: ldr q18, [x1, #96]
; NO_SVE-NEXT: bif v4.16b, v20.16b, v21.16b
; NO_SVE-NEXT: stp q1, q0, [x0]
; NO_SVE-NEXT: mov v0.16b, v21.16b
; NO_SVE-NEXT: mov v1.16b, v21.16b
; NO_SVE-NEXT: stp q3, q2, [x0, #32]
; NO_SVE-NEXT: mov v2.16b, v21.16b
; NO_SVE-NEXT: bsl v0.16b, v5.16b, v16.16b
; NO_SVE-NEXT: bsl v1.16b, v6.16b, v17.16b
; NO_SVE-NEXT: bsl v2.16b, v7.16b, v18.16b
; NO_SVE-NEXT: stp q0, q4, [x0, #64]
; NO_SVE-NEXT: stp q2, q1, [x0, #96]
; NO_SVE-NEXT: ret
;
; VBITS_GE_1024-LABEL: select_v16f64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: and w8, w2, #0x1
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_1024-NEXT: ptrue p1.d
; VBITS_GE_1024-NEXT: mov z2.d, x8
; VBITS_GE_1024-NEXT: and z2.d, z2.d, #0x1
; VBITS_GE_1024-NEXT: cmpne p1.d, p1/z, z2.d, #0
; VBITS_GE_1024-NEXT: sel z0.d, p1, z0.d, z1.d
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
%op1 = load volatile <16 x double>, <16 x double>* %a %op1 = load volatile <16 x double>, <16 x double>* %a
%op2 = load volatile <16 x double>, <16 x double>* %b %op2 = load volatile <16 x double>, <16 x double>* %b
%sel = select i1 %mask, <16 x double> %op1, <16 x double> %op2 %sel = select i1 %mask, <16 x double> %op1, <16 x double> %op2
@@ -763,103 +371,20 @@ define void @select_v16f64(<16 x double>* %a, <16 x double>* %b, i1 %mask) #0 {
ret void ret void
} }
define void @select_v32f64(<32 x double>* %a, <32 x double>* %b, i1 %mask) #0 { define void @select_v32f64(<32 x double>* %a, <32 x double>* %b, i1 %mask) vscale_range(16,0) #0 {
; NO_SVE-LABEL: select_v32f64: ; CHECK-LABEL: select_v32f64:
; NO_SVE: // %bb.0: ; CHECK: // %bb.0:
; NO_SVE-NEXT: stp d11, d10, [sp, #-32]! // 16-byte Folded Spill ; CHECK-NEXT: and w8, w2, #0x1
; NO_SVE-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: ptrue p0.d, vl32
; NO_SVE-NEXT: stp d9, d8, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; NO_SVE-NEXT: .cfi_offset b8, -8 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; NO_SVE-NEXT: .cfi_offset b9, -16 ; CHECK-NEXT: ptrue p1.d
; NO_SVE-NEXT: .cfi_offset b10, -24 ; CHECK-NEXT: mov z2.d, x8
; NO_SVE-NEXT: .cfi_offset b11, -32 ; CHECK-NEXT: and z2.d, z2.d, #0x1
; NO_SVE-NEXT: tst w2, #0x1 ; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
; NO_SVE-NEXT: ldr q0, [x0, #240] ; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
; NO_SVE-NEXT: csetm x8, ne ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; NO_SVE-NEXT: ldr q1, [x0, #224] ; CHECK-NEXT: ret
; NO_SVE-NEXT: ldr q2, [x0, #208]
; NO_SVE-NEXT: ldr q3, [x0, #192]
; NO_SVE-NEXT: ldr q4, [x0, #176]
; NO_SVE-NEXT: dup v8.2d, x8
; NO_SVE-NEXT: ldr q5, [x0, #160]
; NO_SVE-NEXT: ldr q6, [x0, #144]
; NO_SVE-NEXT: ldr q7, [x0, #128]
; NO_SVE-NEXT: ldr q16, [x0, #112]
; NO_SVE-NEXT: ldr q17, [x0, #96]
; NO_SVE-NEXT: ldr q18, [x0, #80]
; NO_SVE-NEXT: ldr q19, [x0, #64]
; NO_SVE-NEXT: ldr q20, [x0, #48]
; NO_SVE-NEXT: ldr q21, [x0, #32]
; NO_SVE-NEXT: ldr q22, [x0, #16]
; NO_SVE-NEXT: ldr q23, [x0]
; NO_SVE-NEXT: ldr q24, [x1, #240]
; NO_SVE-NEXT: ldr q25, [x1, #224]
; NO_SVE-NEXT: ldr q26, [x1, #208]
; NO_SVE-NEXT: ldr q27, [x1, #192]
; NO_SVE-NEXT: bif v0.16b, v24.16b, v8.16b
; NO_SVE-NEXT: ldr q28, [x1, #176]
; NO_SVE-NEXT: bif v1.16b, v25.16b, v8.16b
; NO_SVE-NEXT: ldr q29, [x1, #160]
; NO_SVE-NEXT: bif v2.16b, v26.16b, v8.16b
; NO_SVE-NEXT: ldr q30, [x1, #144]
; NO_SVE-NEXT: bif v3.16b, v27.16b, v8.16b
; NO_SVE-NEXT: ldr q31, [x1, #128]
; NO_SVE-NEXT: ldr q9, [x1, #112]
; NO_SVE-NEXT: ldr q10, [x1, #96]
; NO_SVE-NEXT: bif v4.16b, v28.16b, v8.16b
; NO_SVE-NEXT: ldr q28, [x1, #80]
; NO_SVE-NEXT: ldr q24, [x1, #64]
; NO_SVE-NEXT: ldr q25, [x1, #48]
; NO_SVE-NEXT: ldr q26, [x1, #32]
; NO_SVE-NEXT: ldr q27, [x1, #16]
; NO_SVE-NEXT: ldr q11, [x1]
; NO_SVE-NEXT: stp q3, q2, [x0, #192]
; NO_SVE-NEXT: stp q1, q0, [x0, #224]
; NO_SVE-NEXT: mov v0.16b, v8.16b
; NO_SVE-NEXT: mov v1.16b, v8.16b
; NO_SVE-NEXT: mov v2.16b, v8.16b
; NO_SVE-NEXT: bsl v0.16b, v5.16b, v29.16b
; NO_SVE-NEXT: bsl v1.16b, v6.16b, v30.16b
; NO_SVE-NEXT: bsl v2.16b, v7.16b, v31.16b
; NO_SVE-NEXT: mov v3.16b, v8.16b
; NO_SVE-NEXT: stp q0, q4, [x0, #160]
; NO_SVE-NEXT: mov v4.16b, v8.16b
; NO_SVE-NEXT: mov v0.16b, v8.16b
; NO_SVE-NEXT: stp q2, q1, [x0, #128]
; NO_SVE-NEXT: mov v1.16b, v8.16b
; NO_SVE-NEXT: bsl v3.16b, v16.16b, v9.16b
; NO_SVE-NEXT: bsl v4.16b, v17.16b, v10.16b
; NO_SVE-NEXT: bsl v0.16b, v18.16b, v28.16b
; NO_SVE-NEXT: bsl v1.16b, v19.16b, v24.16b
; NO_SVE-NEXT: mov v2.16b, v8.16b
; NO_SVE-NEXT: stp q4, q3, [x0, #96]
; NO_SVE-NEXT: mov v3.16b, v8.16b
; NO_SVE-NEXT: mov v4.16b, v8.16b
; NO_SVE-NEXT: stp q1, q0, [x0, #64]
; NO_SVE-NEXT: mov v0.16b, v8.16b
; NO_SVE-NEXT: bsl v2.16b, v20.16b, v25.16b
; NO_SVE-NEXT: bsl v3.16b, v21.16b, v26.16b
; NO_SVE-NEXT: bsl v4.16b, v22.16b, v27.16b
; NO_SVE-NEXT: bsl v0.16b, v23.16b, v11.16b
; NO_SVE-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload
; NO_SVE-NEXT: stp q3, q2, [x0, #32]
; NO_SVE-NEXT: stp q0, q4, [x0]
; NO_SVE-NEXT: ldp d11, d10, [sp], #32 // 16-byte Folded Reload
; NO_SVE-NEXT: ret
;
; VBITS_GE_2048-LABEL: select_v32f64:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: and w8, w2, #0x1
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_2048-NEXT: ptrue p1.d
; VBITS_GE_2048-NEXT: mov z2.d, x8
; VBITS_GE_2048-NEXT: and z2.d, z2.d, #0x1
; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z2.d, #0
; VBITS_GE_2048-NEXT: sel z0.d, p1, z0.d, z1.d
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
%op1 = load volatile <32 x double>, <32 x double>* %a %op1 = load volatile <32 x double>, <32 x double>* %a
%op2 = load volatile <32 x double>, <32 x double>* %b %op2 = load volatile <32 x double>, <32 x double>* %b
%sel = select i1 %mask, <32 x double> %op1, <32 x double> %op2 %sel = select i1 %mask, <32 x double> %op1, <32 x double> %op2

[file diff suppressed because it is too large]

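The next file (floating-point selects with vector masks) shows the RUN-line consolidation the commit message describes: sixteen per-width RUN lines collapse to three, and the -aarch64-sve-vector-bits-min=2048 run deliberately reuses the VBITS_GE_512 prefix, since every function this file still checks without a vscale_range attribute lowers to the same code at any width of 512 bits or more. A minimal sketch of the scheme, as a hypothetical test that is not part of the patch:

; Illustrative only; the RUN/prefix layout mirrors the restructured tests.
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"

; A 512-bit operation: the 256-bit run splits it into two vl4 halves and so
; needs its own VBITS_GE_256 expectations, while every run of 512 bits or
; more emits a single ptrue p0.d, vl8 sequence, letting min=512 and min=2048
; share one VBITS_GE_512 check block.
define void @fadd_demo_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
  %op1 = load <8 x double>, <8 x double>* %a
  %op2 = load <8 x double>, <8 x double>* %b
  %res = fadd <8 x double> %op1, %op2
  store <8 x double> %res, <8 x double>* %a
  ret void
}

attributes #0 = { "target-features"="+sve" }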
@@ -1,26 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; Don't use SVE when its registers are no bigger than NEON. ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
target triple = "aarch64-unknown-linux-gnu" target triple = "aarch64-unknown-linux-gnu"
; Don't use SVE for 64-bit vectors. ; Don't use SVE for 64-bit vectors.
define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x i1> %mask) #0 { define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4f16: ; CHECK-LABEL: select_v4f16:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: shl v2.4h, v2.4h, #15 ; CHECK-NEXT: shl v2.4h, v2.4h, #15
@@ -32,7 +18,7 @@ define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x i1> %mask
} }
; Don't use SVE for 128-bit vectors. ; Don't use SVE for 128-bit vectors.
define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x i1> %mask) #0 { define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v8f16: ; CHECK-LABEL: select_v8f16:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: ushll v2.8h, v2.8b, #0 ; CHECK-NEXT: ushll v2.8h, v2.8b, #0
@@ -44,7 +30,7 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x i1> %mask
ret <8 x half> %sel ret <8 x half> %sel
} }
define void @select_v16f16(<16 x half>* %a, <16 x half>* %b) #0 { define void @select_v16f16(<16 x half>* %a, <16 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v16f16: ; CHECK-LABEL: select_v16f16:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: ptrue p0.h, vl16
@@ -96,44 +82,16 @@ define void @select_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
ret void ret void
} }
define void @select_v64f16(<64 x half>* %a, <64 x half>* %b) #0 { define void @select_v64f16(<64 x half>* %a, <64 x half>* %b) vscale_range(8,0) #0 {
; VBITS_GE_256-LABEL: select_v64f16: ; CHECK-LABEL: select_v64f16:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16 ; CHECK-NEXT: ptrue p0.h, vl64
; VBITS_GE_256-NEXT: mov x9, #48 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: mov x10, #32 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] ; CHECK-NEXT: ret
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x1, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x1, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z2.h, z5.h
; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z1.h, z4.h
; VBITS_GE_256-NEXT: fcmeq p3.h, p0/z, z0.h, z6.h
; VBITS_GE_256-NEXT: fcmeq p4.h, p0/z, z3.h, z7.h
; VBITS_GE_256-NEXT: sel z0.h, p3, z0.h, z6.h
; VBITS_GE_256-NEXT: sel z1.h, p2, z1.h, z4.h
; VBITS_GE_256-NEXT: sel z2.h, p1, z2.h, z5.h
; VBITS_GE_256-NEXT: sel z3.h, p4, z3.h, z7.h
; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: select_v64f16:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_1024-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
; VBITS_GE_1024-NEXT: sel z0.h, p1, z0.h, z1.h
; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
%op1 = load <64 x half>, <64 x half>* %a %op1 = load <64 x half>, <64 x half>* %a
%op2 = load <64 x half>, <64 x half>* %b %op2 = load <64 x half>, <64 x half>* %b
%mask = fcmp oeq <64 x half> %op1, %op2 %mask = fcmp oeq <64 x half> %op1, %op2
@@ -142,68 +100,16 @@ define void @select_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
ret void ret void
} }
define void @select_v128f16(<128 x half>* %a, <128 x half>* %b) #0 { define void @select_v128f16(<128 x half>* %a, <128 x half>* %b) vscale_range(16,0) #0 {
; VBITS_GE_256-LABEL: select_v128f16: ; CHECK-LABEL: select_v128f16:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16 ; CHECK-NEXT: ptrue p0.h, vl128
; VBITS_GE_256-NEXT: mov x9, #48 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: mov x10, #32 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: mov x11, #80 ; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
; VBITS_GE_256-NEXT: mov x12, #64 ; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
; VBITS_GE_256-NEXT: mov x13, #112 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_256-NEXT: mov x14, #96 ; CHECK-NEXT: ret
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x0, x12, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x0, x13, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x0, x14, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z16.h }, p0/z, [x1, x13, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z17.h }, p0/z, [x1, x14, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z18.h }, p0/z, [x1, x11, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z19.h }, p0/z, [x1, x12, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z20.h }, p0/z, [x1, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z21.h }, p0/z, [x1, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z22.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z23.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z6.h, z17.h
; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z5.h, z16.h
; VBITS_GE_256-NEXT: fcmeq p3.h, p0/z, z4.h, z19.h
; VBITS_GE_256-NEXT: fcmeq p4.h, p0/z, z3.h, z18.h
; VBITS_GE_256-NEXT: fcmeq p5.h, p0/z, z2.h, z21.h
; VBITS_GE_256-NEXT: fcmeq p6.h, p0/z, z1.h, z20.h
; VBITS_GE_256-NEXT: fcmeq p7.h, p0/z, z0.h, z22.h
; VBITS_GE_256-NEXT: fcmeq p8.h, p0/z, z7.h, z23.h
; VBITS_GE_256-NEXT: sel z0.h, p7, z0.h, z22.h
; VBITS_GE_256-NEXT: sel z1.h, p6, z1.h, z20.h
; VBITS_GE_256-NEXT: sel z2.h, p5, z2.h, z21.h
; VBITS_GE_256-NEXT: sel z3.h, p4, z3.h, z18.h
; VBITS_GE_256-NEXT: sel z4.h, p3, z4.h, z19.h
; VBITS_GE_256-NEXT: sel z5.h, p2, z5.h, z16.h
; VBITS_GE_256-NEXT: sel z6.h, p1, z6.h, z17.h
; VBITS_GE_256-NEXT: sel z7.h, p8, z7.h, z23.h
; VBITS_GE_256-NEXT: st1h { z6.h }, p0, [x0, x14, lsl #1]
; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x0, x13, lsl #1]
; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0, x12, lsl #1]
; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x11, lsl #1]
; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: select_v128f16:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_2048-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
; VBITS_GE_2048-NEXT: sel z0.h, p1, z0.h, z1.h
; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
%op1 = load <128 x half>, <128 x half>* %a %op1 = load <128 x half>, <128 x half>* %a
%op2 = load <128 x half>, <128 x half>* %b %op2 = load <128 x half>, <128 x half>* %b
%mask = fcmp oeq <128 x half> %op1, %op2 %mask = fcmp oeq <128 x half> %op1, %op2
@@ -213,7 +119,7 @@ define void @select_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
} }
; Don't use SVE for 64-bit vectors. ; Don't use SVE for 64-bit vectors.
define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x i1> %mask) #0 { define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v2f32: ; CHECK-LABEL: select_v2f32:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: shl v2.2s, v2.2s, #31 ; CHECK-NEXT: shl v2.2s, v2.2s, #31
@@ -225,7 +131,7 @@ define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x i1> %m
} }
; Don't use SVE for 128-bit vectors. ; Don't use SVE for 128-bit vectors.
define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x i1> %mask) #0 { define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4f32: ; CHECK-LABEL: select_v4f32:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: ushll v2.4s, v2.4h, #0 ; CHECK-NEXT: ushll v2.4s, v2.4h, #0
@@ -237,7 +143,7 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x i1> %m
ret <4 x float> %sel ret <4 x float> %sel
} }
define void @select_v8f32(<8 x float>* %a, <8 x float>* %b) #0 { define void @select_v8f32(<8 x float>* %a, <8 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v8f32: ; CHECK-LABEL: select_v8f32:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ptrue p0.s, vl8
@@ -289,44 +195,16 @@ define void @select_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
ret void ret void
} }
define void @select_v32f32(<32 x float>* %a, <32 x float>* %b) #0 { define void @select_v32f32(<32 x float>* %a, <32 x float>* %b) vscale_range(8,0) #0 {
; VBITS_GE_256-LABEL: select_v32f32: ; CHECK-LABEL: select_v32f32:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8 ; CHECK-NEXT: ptrue p0.s, vl32
; VBITS_GE_256-NEXT: mov x9, #24 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: mov x10, #16 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] ; CHECK-NEXT: ret
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x1, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z2.s, z5.s
; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z1.s, z4.s
; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z0.s, z6.s
; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z3.s, z7.s
; VBITS_GE_256-NEXT: sel z0.s, p3, z0.s, z6.s
; VBITS_GE_256-NEXT: sel z1.s, p2, z1.s, z4.s
; VBITS_GE_256-NEXT: sel z2.s, p1, z2.s, z5.s
; VBITS_GE_256-NEXT: sel z3.s, p4, z3.s, z7.s
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: select_v32f32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_1024-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_1024-NEXT: sel z0.s, p1, z0.s, z1.s
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a %op1 = load <32 x float>, <32 x float>* %a
%op2 = load <32 x float>, <32 x float>* %b %op2 = load <32 x float>, <32 x float>* %b
%mask = fcmp oeq <32 x float> %op1, %op2 %mask = fcmp oeq <32 x float> %op1, %op2
@@ -335,68 +213,16 @@ define void @select_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
ret void ret void
} }
define void @select_v64f32(<64 x float>* %a, <64 x float>* %b) #0 { define void @select_v64f32(<64 x float>* %a, <64 x float>* %b) vscale_range(16,0) #0 {
; VBITS_GE_256-LABEL: select_v64f32: ; CHECK-LABEL: select_v64f32:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8 ; CHECK-NEXT: ptrue p0.s, vl64
; VBITS_GE_256-NEXT: mov x9, #24 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: mov x10, #16 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: mov x11, #40 ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_256-NEXT: mov x12, #32 ; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
; VBITS_GE_256-NEXT: mov x13, #56 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_256-NEXT: mov x14, #48 ; CHECK-NEXT: ret
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x12, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x13, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x14, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1w { z16.s }, p0/z, [x1, x13, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z17.s }, p0/z, [x1, x14, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z18.s }, p0/z, [x1, x11, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z19.s }, p0/z, [x1, x12, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z20.s }, p0/z, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z21.s }, p0/z, [x1, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z22.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z23.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z6.s, z17.s
; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z5.s, z16.s
; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z4.s, z19.s
; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z3.s, z18.s
; VBITS_GE_256-NEXT: fcmeq p5.s, p0/z, z2.s, z21.s
; VBITS_GE_256-NEXT: fcmeq p6.s, p0/z, z1.s, z20.s
; VBITS_GE_256-NEXT: fcmeq p7.s, p0/z, z0.s, z22.s
; VBITS_GE_256-NEXT: fcmeq p8.s, p0/z, z7.s, z23.s
; VBITS_GE_256-NEXT: sel z0.s, p7, z0.s, z22.s
; VBITS_GE_256-NEXT: sel z1.s, p6, z1.s, z20.s
; VBITS_GE_256-NEXT: sel z2.s, p5, z2.s, z21.s
; VBITS_GE_256-NEXT: sel z3.s, p4, z3.s, z18.s
; VBITS_GE_256-NEXT: sel z4.s, p3, z4.s, z19.s
; VBITS_GE_256-NEXT: sel z5.s, p2, z5.s, z16.s
; VBITS_GE_256-NEXT: sel z6.s, p1, z6.s, z17.s
; VBITS_GE_256-NEXT: sel z7.s, p8, z7.s, z23.s
; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x0, x14, lsl #2]
; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x0, x13, lsl #2]
; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x12, lsl #2]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0, x11, lsl #2]
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: select_v64f32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_2048-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_2048-NEXT: sel z0.s, p1, z0.s, z1.s
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a %op1 = load <64 x float>, <64 x float>* %a
%op2 = load <64 x float>, <64 x float>* %b %op2 = load <64 x float>, <64 x float>* %b
%mask = fcmp oeq <64 x float> %op1, %op2 %mask = fcmp oeq <64 x float> %op1, %op2
@@ -406,7 +232,7 @@ define void @select_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
} }
; Don't use SVE for 64-bit vectors. ; Don't use SVE for 64-bit vectors.
define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x i1> %mask) #0 { define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v1f64: ; CHECK-LABEL: select_v1f64:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: tst w0, #0x1
@@ -419,7 +245,7 @@ define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x i1>
} }
; Don't use SVE for 128-bit vectors. ; Don't use SVE for 128-bit vectors.
define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x i1> %mask) #0 { define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v2f64: ; CHECK-LABEL: select_v2f64:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: ushll v2.2d, v2.2s, #0 ; CHECK-NEXT: ushll v2.2d, v2.2s, #0
@@ -431,7 +257,7 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x i1>
ret <2 x double> %sel ret <2 x double> %sel
} }
define void @select_v4f64(<4 x double>* %a, <4 x double>* %b) #0 { define void @select_v4f64(<4 x double>* %a, <4 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4f64: ; CHECK-LABEL: select_v4f64:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ptrue p0.d, vl4
@@ -483,44 +309,16 @@ define void @select_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
ret void ret void
} }
define void @select_v16f64(<16 x double>* %a, <16 x double>* %b) #0 { define void @select_v16f64(<16 x double>* %a, <16 x double>* %b) vscale_range(8,0) #0 {
; VBITS_GE_256-LABEL: select_v16f64: ; CHECK-LABEL: select_v16f64:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4 ; CHECK-NEXT: ptrue p0.d, vl16
; VBITS_GE_256-NEXT: mov x9, #12 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: mov x10, #8 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] ; CHECK-NEXT: ret
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x1, x10, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z2.d, z5.d
; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z1.d, z4.d
; VBITS_GE_256-NEXT: fcmeq p3.d, p0/z, z0.d, z6.d
; VBITS_GE_256-NEXT: fcmeq p4.d, p0/z, z3.d, z7.d
; VBITS_GE_256-NEXT: sel z0.d, p3, z0.d, z6.d
; VBITS_GE_256-NEXT: sel z1.d, p2, z1.d, z4.d
; VBITS_GE_256-NEXT: sel z2.d, p1, z2.d, z5.d
; VBITS_GE_256-NEXT: sel z3.d, p4, z3.d, z7.d
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: select_v16f64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_1024-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_1024-NEXT: sel z0.d, p1, z0.d, z1.d
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a %op1 = load <16 x double>, <16 x double>* %a
%op2 = load <16 x double>, <16 x double>* %b %op2 = load <16 x double>, <16 x double>* %b
%mask = fcmp oeq <16 x double> %op1, %op2 %mask = fcmp oeq <16 x double> %op1, %op2
@@ -529,68 +327,16 @@ define void @select_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
ret void ret void
} }
define void @select_v32f64(<32 x double>* %a, <32 x double>* %b) #0 { define void @select_v32f64(<32 x double>* %a, <32 x double>* %b) vscale_range(16,0) #0 {
; VBITS_GE_256-LABEL: select_v32f64: ; CHECK-LABEL: select_v32f64:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4 ; CHECK-NEXT: ptrue p0.d, vl32
; VBITS_GE_256-NEXT: mov x9, #12 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: mov x10, #8 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: mov x11, #20 ; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_256-NEXT: mov x12, #16 ; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
; VBITS_GE_256-NEXT: mov x13, #28 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_256-NEXT: mov x14, #24 ; CHECK-NEXT: ret
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1d { z16.d }, p0/z, [x1, x13, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z17.d }, p0/z, [x1, x14, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z18.d }, p0/z, [x1, x11, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z19.d }, p0/z, [x1, x12, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z20.d }, p0/z, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z21.d }, p0/z, [x1, x10, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z22.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z23.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z6.d, z17.d
; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z5.d, z16.d
; VBITS_GE_256-NEXT: fcmeq p3.d, p0/z, z4.d, z19.d
; VBITS_GE_256-NEXT: fcmeq p4.d, p0/z, z3.d, z18.d
; VBITS_GE_256-NEXT: fcmeq p5.d, p0/z, z2.d, z21.d
; VBITS_GE_256-NEXT: fcmeq p6.d, p0/z, z1.d, z20.d
; VBITS_GE_256-NEXT: fcmeq p7.d, p0/z, z0.d, z22.d
; VBITS_GE_256-NEXT: fcmeq p8.d, p0/z, z7.d, z23.d
; VBITS_GE_256-NEXT: sel z0.d, p7, z0.d, z22.d
; VBITS_GE_256-NEXT: sel z1.d, p6, z1.d, z20.d
; VBITS_GE_256-NEXT: sel z2.d, p5, z2.d, z21.d
; VBITS_GE_256-NEXT: sel z3.d, p4, z3.d, z18.d
; VBITS_GE_256-NEXT: sel z4.d, p3, z4.d, z19.d
; VBITS_GE_256-NEXT: sel z5.d, p2, z5.d, z16.d
; VBITS_GE_256-NEXT: sel z6.d, p1, z6.d, z17.d
; VBITS_GE_256-NEXT: sel z7.d, p8, z7.d, z23.d
; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x0, x14, lsl #3]
; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x0, x13, lsl #3]
; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3]
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x11, lsl #3]
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: select_v32f64:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_2048-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_2048-NEXT: sel z0.d, p1, z0.d, z1.d
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a %op1 = load <32 x double>, <32 x double>* %a
%op2 = load <32 x double>, <32 x double>* %b %op2 = load <32 x double>, <32 x double>* %b
%mask = fcmp oeq <32 x double> %op1, %op2 %mask = fcmp oeq <32 x double> %op1, %op2
@@ -599,4 +345,4 @@ define void @select_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
ret void ret void
} }
attributes #0 = { "target-features"="+sve" uwtable } attributes #0 = { "target-features"="+sve" }


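The insertelement file that follows leans on the other half of the restructuring: rather than one RUN line per width, each function carries a vscale_range attribute guaranteeing the width it needs, and the attribute, not the command line, sets the floor, so even the min=256 RUN line exercises the wider lowerings. Since an SVE register is vscale x 128 bits, the attribute values used throughout these tests map directly onto the fixed-length types; a sketch of the arithmetic, on an illustrative function that is not from the patch:

; vscale_range(2,0)  : vscale >= 2  -> registers >= 2*128  = 256 bits
; vscale_range(8,0)  : vscale >= 8  -> registers >= 8*128  = 1024 bits
; vscale_range(16,0) : vscale >= 16 -> registers >= 16*128 = 2048 bits
; A maximum of 0 leaves the upper bound open.
define <64 x half> @demo_v64f16(<64 x half>* %a) vscale_range(8,0) #0 {
  ; <64 x half> is 1024 bits, so the (8,0) guarantee fits it in one Z
  ; register (ptrue p0.h, vl64), as in insertelement_v64f16 below.
  %op = load <64 x half>, <64 x half>* %a
  ret <64 x half> %op
}
attributes #0 = { "target-features"="+sve" }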
@@ -1,21 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; Don't use SVE when its registers are no bigger than NEON. ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
target triple = "aarch64-unknown-linux-gnu" target triple = "aarch64-unknown-linux-gnu"
@@ -24,49 +10,66 @@ target triple = "aarch64-unknown-linux-gnu"
; ;
; Don't use SVE for 64-bit vectors. ; Don't use SVE for 64-bit vectors.
define <4 x half> @insertelement_v4f16(<4 x half> %op1) #0 { define <4 x half> @insertelement_v4f16(<4 x half> %op1) vscale_range(2,0) #0 {
; VBITS_GE_256-LABEL: insertelement_v4f16: ; CHECK-LABEL: insertelement_v4f16:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: fmov h1, #5.00000000 ; CHECK-NEXT: fmov h1, #5.00000000
; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; VBITS_GE_256-NEXT: mov v0.h[3], v1.h[0] ; CHECK-NEXT: mov v0.h[3], v1.h[0]
; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_256-NEXT: ret ; CHECK-NEXT: ret
%r = insertelement <4 x half> %op1, half 5.0, i64 3 %r = insertelement <4 x half> %op1, half 5.0, i64 3
ret <4 x half> %r ret <4 x half> %r
} }
; Don't use SVE for 128-bit vectors. ; Don't use SVE for 128-bit vectors.
define <8 x half> @insertelement_v8f16(<8 x half> %op1) #0 { define <8 x half> @insertelement_v8f16(<8 x half> %op1) vscale_range(2,0) #0 {
; VBITS_GE_256-LABEL: insertelement_v8f16: ; CHECK-LABEL: insertelement_v8f16:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: fmov h1, #5.00000000 ; CHECK-NEXT: fmov h1, #5.00000000
; VBITS_GE_256-NEXT: mov v0.h[7], v1.h[0] ; CHECK-NEXT: mov v0.h[7], v1.h[0]
; VBITS_GE_256-NEXT: ret ; CHECK-NEXT: ret
%r = insertelement <8 x half> %op1, half 5.0, i64 7 %r = insertelement <8 x half> %op1, half 5.0, i64 7
ret <8 x half> %r ret <8 x half> %r
} }
define <16 x half> @insertelement_v16f16(<16 x half>* %a) #0 { define <16 x half> @insertelement_v16f16(<16 x half>* %a) vscale_range(2,0) #0 {
; VBITS_GE_256-LABEL: insertelement_v16f16: ; CHECK-LABEL: insertelement_v16f16:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov w9, #15 ; CHECK-NEXT: mov w9, #15
; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: fmov h2, #5.00000000 ; CHECK-NEXT: fmov h2, #5.00000000
; VBITS_GE_256-NEXT: index z3.h, #0, #1 ; CHECK-NEXT: index z3.h, #0, #1
; VBITS_GE_256-NEXT: ptrue p1.h ; CHECK-NEXT: ptrue p1.h
; VBITS_GE_256-NEXT: mov z1.h, w9 ; CHECK-NEXT: mov z1.h, w9
; VBITS_GE_256-NEXT: cmpeq p1.h, p1/z, z3.h, z1.h ; CHECK-NEXT: cmpeq p1.h, p1/z, z3.h, z1.h
; VBITS_GE_256-NEXT: mov z0.h, p1/m, h2 ; CHECK-NEXT: mov z0.h, p1/m, h2
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x8] ; CHECK-NEXT: st1h { z0.h }, p0, [x8]
; VBITS_GE_256-NEXT: ret ; CHECK-NEXT: ret
%op1 = load <16 x half>, <16 x half>* %a %op1 = load <16 x half>, <16 x half>* %a
%r = insertelement <16 x half> %op1, half 5.0, i64 15 %r = insertelement <16 x half> %op1, half 5.0, i64 15
ret <16 x half> %r ret <16 x half> %r
} }
define <32 x half> @insertelement_v32f16(<32 x half>* %a) #0 { define <32 x half> @insertelement_v32f16(<32 x half>* %a) #0 {
; VBITS_GE_256-LABEL: insertelement_v32f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x9, #16
; VBITS_GE_256-NEXT: mov w10, #15
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: fmov h3, #5.00000000
; VBITS_GE_256-NEXT: index z4.h, #0, #1
; VBITS_GE_256-NEXT: ptrue p1.h
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: mov z2.h, w10
; VBITS_GE_256-NEXT: cmpeq p1.h, p1/z, z4.h, z2.h
; VBITS_GE_256-NEXT: mov z0.h, p1/m, h3
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x8, x9, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: insertelement_v32f16: ; VBITS_GE_512-LABEL: insertelement_v32f16:
; VBITS_GE_512: // %bb.0: ; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: mov w9, #31 ; VBITS_GE_512-NEXT: mov w9, #31
@@ -85,88 +88,105 @@ define <32 x half> @insertelement_v32f16(<32 x half>* %a) #0 {
ret <32 x half> %r ret <32 x half> %r
} }
define <64 x half> @insertelement_v64f16(<64 x half>* %a) #0 { define <64 x half> @insertelement_v64f16(<64 x half>* %a) vscale_range(8,0) #0 {
; VBITS_GE_1024-LABEL: insertelement_v64f16: ; CHECK-LABEL: insertelement_v64f16:
; VBITS_GE_1024: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_1024-NEXT: mov w9, #63 ; CHECK-NEXT: mov w9, #63
; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: ptrue p0.h, vl64
; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: fmov h2, #5.00000000 ; CHECK-NEXT: fmov h2, #5.00000000
; VBITS_GE_1024-NEXT: index z3.h, #0, #1 ; CHECK-NEXT: index z3.h, #0, #1
; VBITS_GE_1024-NEXT: ptrue p1.h ; CHECK-NEXT: ptrue p1.h
; VBITS_GE_1024-NEXT: mov z1.h, w9 ; CHECK-NEXT: mov z1.h, w9
; VBITS_GE_1024-NEXT: cmpeq p1.h, p1/z, z3.h, z1.h ; CHECK-NEXT: cmpeq p1.h, p1/z, z3.h, z1.h
; VBITS_GE_1024-NEXT: mov z0.h, p1/m, h2 ; CHECK-NEXT: mov z0.h, p1/m, h2
; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x8] ; CHECK-NEXT: st1h { z0.h }, p0, [x8]
; VBITS_GE_1024-NEXT: ret ; CHECK-NEXT: ret
%op1 = load <64 x half>, <64 x half>* %a %op1 = load <64 x half>, <64 x half>* %a
%r = insertelement <64 x half> %op1, half 5.0, i64 63 %r = insertelement <64 x half> %op1, half 5.0, i64 63
ret <64 x half> %r ret <64 x half> %r
} }
define <128 x half> @insertelement_v128f16(<128 x half>* %a) #0 { define <128 x half> @insertelement_v128f16(<128 x half>* %a) vscale_range(16,0) #0 {
; VBITS_GE_2048-LABEL: insertelement_v128f16: ; CHECK-LABEL: insertelement_v128f16:
; VBITS_GE_2048: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_2048-NEXT: mov w9, #127 ; CHECK-NEXT: mov w9, #127
; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: ptrue p0.h, vl128
; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_2048-NEXT: fmov h2, #5.00000000 ; CHECK-NEXT: fmov h2, #5.00000000
; VBITS_GE_2048-NEXT: index z3.h, #0, #1 ; CHECK-NEXT: index z3.h, #0, #1
; VBITS_GE_2048-NEXT: ptrue p1.h ; CHECK-NEXT: ptrue p1.h
; VBITS_GE_2048-NEXT: mov z1.h, w9 ; CHECK-NEXT: mov z1.h, w9
; VBITS_GE_2048-NEXT: cmpeq p1.h, p1/z, z3.h, z1.h ; CHECK-NEXT: cmpeq p1.h, p1/z, z3.h, z1.h
; VBITS_GE_2048-NEXT: mov z0.h, p1/m, h2 ; CHECK-NEXT: mov z0.h, p1/m, h2
; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x8] ; CHECK-NEXT: st1h { z0.h }, p0, [x8]
; VBITS_GE_2048-NEXT: ret ; CHECK-NEXT: ret
%op1 = load <128 x half>, <128 x half>* %a %op1 = load <128 x half>, <128 x half>* %a
%r = insertelement <128 x half> %op1, half 5.0, i64 127 %r = insertelement <128 x half> %op1, half 5.0, i64 127
ret <128 x half> %r ret <128 x half> %r
} }
; Don't use SVE for 64-bit vectors. ; Don't use SVE for 64-bit vectors.
define <2 x float> @insertelement_v2f32(<2 x float> %op1) #0 { define <2 x float> @insertelement_v2f32(<2 x float> %op1) vscale_range(2,0) #0 {
; VBITS_GE_256-LABEL: insertelement_v2f32: ; CHECK-LABEL: insertelement_v2f32:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: fmov s1, #5.00000000 ; CHECK-NEXT: fmov s1, #5.00000000
; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; VBITS_GE_256-NEXT: mov v0.s[1], v1.s[0] ; CHECK-NEXT: mov v0.s[1], v1.s[0]
; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_256-NEXT: ret ; CHECK-NEXT: ret
%r = insertelement <2 x float> %op1, float 5.0, i64 1 %r = insertelement <2 x float> %op1, float 5.0, i64 1
ret <2 x float> %r ret <2 x float> %r
} }
; Don't use SVE for 128-bit vectors. ; Don't use SVE for 128-bit vectors.
define <4 x float> @insertelement_v4f32(<4 x float> %op1) #0 { define <4 x float> @insertelement_v4f32(<4 x float> %op1) vscale_range(2,0) #0 {
; VBITS_GE_256-LABEL: insertelement_v4f32: ; CHECK-LABEL: insertelement_v4f32:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: fmov s1, #5.00000000 ; CHECK-NEXT: fmov s1, #5.00000000
; VBITS_GE_256-NEXT: mov v0.s[3], v1.s[0] ; CHECK-NEXT: mov v0.s[3], v1.s[0]
; VBITS_GE_256-NEXT: ret ; CHECK-NEXT: ret
%r = insertelement <4 x float> %op1, float 5.0, i64 3 %r = insertelement <4 x float> %op1, float 5.0, i64 3
ret <4 x float> %r ret <4 x float> %r
} }
define <8 x float> @insertelement_v8f32(<8 x float>* %a) #0 { define <8 x float> @insertelement_v8f32(<8 x float>* %a) vscale_range(2,0) #0 {
; VBITS_GE_256-LABEL: insertelement_v8f32: ; CHECK-LABEL: insertelement_v8f32:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov w9, #7 ; CHECK-NEXT: mov w9, #7
; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: fmov s2, #5.00000000 ; CHECK-NEXT: fmov s2, #5.00000000
; VBITS_GE_256-NEXT: index z3.s, #0, #1 ; CHECK-NEXT: index z3.s, #0, #1
; VBITS_GE_256-NEXT: ptrue p1.s ; CHECK-NEXT: ptrue p1.s
; VBITS_GE_256-NEXT: mov z1.s, w9 ; CHECK-NEXT: mov z1.s, w9
; VBITS_GE_256-NEXT: cmpeq p1.s, p1/z, z3.s, z1.s ; CHECK-NEXT: cmpeq p1.s, p1/z, z3.s, z1.s
; VBITS_GE_256-NEXT: mov z0.s, p1/m, s2 ; CHECK-NEXT: mov z0.s, p1/m, s2
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8] ; CHECK-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_256-NEXT: ret ; CHECK-NEXT: ret
%op1 = load <8 x float>, <8 x float>* %a %op1 = load <8 x float>, <8 x float>* %a
%r = insertelement <8 x float> %op1, float 5.0, i64 7 %r = insertelement <8 x float> %op1, float 5.0, i64 7
ret <8 x float> %r ret <8 x float> %r
} }
define <16 x float> @insertelement_v16f32(<16 x float>* %a) #0 { define <16 x float> @insertelement_v16f32(<16 x float>* %a) #0 {
; VBITS_GE_256-LABEL: insertelement_v16f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x9, #8
; VBITS_GE_256-NEXT: mov w10, #7
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: fmov s3, #5.00000000
; VBITS_GE_256-NEXT: index z4.s, #0, #1
; VBITS_GE_256-NEXT: ptrue p1.s
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: mov z2.s, w10
; VBITS_GE_256-NEXT: cmpeq p1.s, p1/z, z4.s, z2.s
; VBITS_GE_256-NEXT: mov z0.s, p1/m, s3
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: insertelement_v16f32: ; VBITS_GE_512-LABEL: insertelement_v16f32:
; VBITS_GE_512: // %bb.0: ; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: mov w9, #15 ; VBITS_GE_512-NEXT: mov w9, #15
@@ -185,86 +205,103 @@ define <16 x float> @insertelement_v16f32(<16 x float>* %a) #0 {
ret <16 x float> %r ret <16 x float> %r
} }
define <32 x float> @insertelement_v32f32(<32 x float>* %a) #0 { define <32 x float> @insertelement_v32f32(<32 x float>* %a) vscale_range(8,0) #0 {
; VBITS_GE_1024-LABEL: insertelement_v32f32: ; CHECK-LABEL: insertelement_v32f32:
; VBITS_GE_1024: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_1024-NEXT: mov w9, #31 ; CHECK-NEXT: mov w9, #31
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: fmov s2, #5.00000000 ; CHECK-NEXT: fmov s2, #5.00000000
; VBITS_GE_1024-NEXT: index z3.s, #0, #1 ; CHECK-NEXT: index z3.s, #0, #1
; VBITS_GE_1024-NEXT: ptrue p1.s ; CHECK-NEXT: ptrue p1.s
; VBITS_GE_1024-NEXT: mov z1.s, w9 ; CHECK-NEXT: mov z1.s, w9
; VBITS_GE_1024-NEXT: cmpeq p1.s, p1/z, z3.s, z1.s ; CHECK-NEXT: cmpeq p1.s, p1/z, z3.s, z1.s
; VBITS_GE_1024-NEXT: mov z0.s, p1/m, s2 ; CHECK-NEXT: mov z0.s, p1/m, s2
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8] ; CHECK-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_1024-NEXT: ret ; CHECK-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a %op1 = load <32 x float>, <32 x float>* %a
%r = insertelement <32 x float> %op1, float 5.0, i64 31 %r = insertelement <32 x float> %op1, float 5.0, i64 31
ret <32 x float> %r ret <32 x float> %r
} }
define <64 x float> @insertelement_v64f32(<64 x float>* %a) #0 { define <64 x float> @insertelement_v64f32(<64 x float>* %a) vscale_range(16,0) #0 {
; VBITS_GE_2048-LABEL: insertelement_v64f32: ; CHECK-LABEL: insertelement_v64f32:
; VBITS_GE_2048: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_2048-NEXT: mov w9, #63 ; CHECK-NEXT: mov w9, #63
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: fmov s2, #5.00000000 ; CHECK-NEXT: fmov s2, #5.00000000
; VBITS_GE_2048-NEXT: index z3.s, #0, #1 ; CHECK-NEXT: index z3.s, #0, #1
; VBITS_GE_2048-NEXT: ptrue p1.s ; CHECK-NEXT: ptrue p1.s
; VBITS_GE_2048-NEXT: mov z1.s, w9 ; CHECK-NEXT: mov z1.s, w9
; VBITS_GE_2048-NEXT: cmpeq p1.s, p1/z, z3.s, z1.s ; CHECK-NEXT: cmpeq p1.s, p1/z, z3.s, z1.s
; VBITS_GE_2048-NEXT: mov z0.s, p1/m, s2 ; CHECK-NEXT: mov z0.s, p1/m, s2
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8] ; CHECK-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_2048-NEXT: ret ; CHECK-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a %op1 = load <64 x float>, <64 x float>* %a
%r = insertelement <64 x float> %op1, float 5.0, i64 63 %r = insertelement <64 x float> %op1, float 5.0, i64 63
ret <64 x float> %r ret <64 x float> %r
} }
; Don't use SVE for 64-bit vectors. ; Don't use SVE for 64-bit vectors.
define <1 x double> @insertelement_v1f64(<1 x double> %op1) #0 { define <1 x double> @insertelement_v1f64(<1 x double> %op1) vscale_range(2,0) #0 {
; VBITS_GE_256-LABEL: insertelement_v1f64: ; CHECK-LABEL: insertelement_v1f64:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4617315517961601024 ; CHECK-NEXT: mov x8, #4617315517961601024
; VBITS_GE_256-NEXT: fmov d0, x8 ; CHECK-NEXT: fmov d0, x8
; VBITS_GE_256-NEXT: ret ; CHECK-NEXT: ret
%r = insertelement <1 x double> %op1, double 5.0, i64 0 %r = insertelement <1 x double> %op1, double 5.0, i64 0
ret <1 x double> %r ret <1 x double> %r
} }
; Don't use SVE for 128-bit vectors. ; Don't use SVE for 128-bit vectors.
define <2 x double> @insertelement_v2f64(<2 x double> %op1) #0 { define <2 x double> @insertelement_v2f64(<2 x double> %op1) vscale_range(2,0) #0 {
; VBITS_GE_256-LABEL: insertelement_v2f64: ; CHECK-LABEL: insertelement_v2f64:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: fmov d1, #5.00000000 ; CHECK-NEXT: fmov d1, #5.00000000
; VBITS_GE_256-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: mov v0.d[1], v1.d[0]
; VBITS_GE_256-NEXT: ret ; CHECK-NEXT: ret
%r = insertelement <2 x double> %op1, double 5.0, i64 1 %r = insertelement <2 x double> %op1, double 5.0, i64 1
ret <2 x double> %r ret <2 x double> %r
} }
define <4 x double> @insertelement_v4f64(<4 x double>* %a) #0 { define <4 x double> @insertelement_v4f64(<4 x double>* %a) vscale_range(2,0) #0 {
; VBITS_GE_256-LABEL: insertelement_v4f64: ; CHECK-LABEL: insertelement_v4f64:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov w9, #3 ; CHECK-NEXT: mov w9, #3
; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: fmov d2, #5.00000000 ; CHECK-NEXT: fmov d2, #5.00000000
; VBITS_GE_256-NEXT: index z3.d, #0, #1 ; CHECK-NEXT: index z3.d, #0, #1
; VBITS_GE_256-NEXT: ptrue p1.d ; CHECK-NEXT: ptrue p1.d
; VBITS_GE_256-NEXT: mov z1.d, x9 ; CHECK-NEXT: mov z1.d, x9
; VBITS_GE_256-NEXT: cmpeq p1.d, p1/z, z3.d, z1.d ; CHECK-NEXT: cmpeq p1.d, p1/z, z3.d, z1.d
; VBITS_GE_256-NEXT: mov z0.d, p1/m, d2 ; CHECK-NEXT: mov z0.d, p1/m, d2
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8] ; CHECK-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_256-NEXT: ret ; CHECK-NEXT: ret
%op1 = load <4 x double>, <4 x double>* %a %op1 = load <4 x double>, <4 x double>* %a
%r = insertelement <4 x double> %op1, double 5.0, i64 3 %r = insertelement <4 x double> %op1, double 5.0, i64 3
ret <4 x double> %r ret <4 x double> %r
} }
define <8 x double> @insertelement_v8f64(<8 x double>* %a) #0 { define <8 x double> @insertelement_v8f64(<8 x double>* %a) #0 {
; VBITS_GE_256-LABEL: insertelement_v8f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x9, #4
; VBITS_GE_256-NEXT: mov w10, #3
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: fmov d3, #5.00000000
; VBITS_GE_256-NEXT: index z4.d, #0, #1
; VBITS_GE_256-NEXT: ptrue p1.d
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: mov z2.d, x10
; VBITS_GE_256-NEXT: cmpeq p1.d, p1/z, z4.d, z2.d
; VBITS_GE_256-NEXT: mov z0.d, p1/m, d3
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: insertelement_v8f64: ; VBITS_GE_512-LABEL: insertelement_v8f64:
; VBITS_GE_512: // %bb.0: ; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: mov w9, #7 ; VBITS_GE_512-NEXT: mov w9, #7
@@ -283,39 +320,39 @@ define <8 x double> @insertelement_v8f64(<8 x double>* %a) #0 {
ret <8 x double> %r ret <8 x double> %r
} }
define <16 x double> @insertelement_v16f64(<16 x double>* %a) #0 { define <16 x double> @insertelement_v16f64(<16 x double>* %a) vscale_range(8,0) #0 {
; VBITS_GE_1024-LABEL: insertelement_v16f64: ; CHECK-LABEL: insertelement_v16f64:
; VBITS_GE_1024: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_1024-NEXT: mov w9, #15 ; CHECK-NEXT: mov w9, #15
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: fmov d2, #5.00000000 ; CHECK-NEXT: fmov d2, #5.00000000
; VBITS_GE_1024-NEXT: index z3.d, #0, #1 ; CHECK-NEXT: index z3.d, #0, #1
; VBITS_GE_1024-NEXT: ptrue p1.d ; CHECK-NEXT: ptrue p1.d
; VBITS_GE_1024-NEXT: mov z1.d, x9 ; CHECK-NEXT: mov z1.d, x9
; VBITS_GE_1024-NEXT: cmpeq p1.d, p1/z, z3.d, z1.d ; CHECK-NEXT: cmpeq p1.d, p1/z, z3.d, z1.d
; VBITS_GE_1024-NEXT: mov z0.d, p1/m, d2 ; CHECK-NEXT: mov z0.d, p1/m, d2
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8] ; CHECK-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_1024-NEXT: ret ; CHECK-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a %op1 = load <16 x double>, <16 x double>* %a
%r = insertelement <16 x double> %op1, double 5.0, i64 15 %r = insertelement <16 x double> %op1, double 5.0, i64 15
ret <16 x double> %r ret <16 x double> %r
} }
define <32 x double> @insertelement_v32f64(<32 x double>* %a) #0 { define <32 x double> @insertelement_v32f64(<32 x double>* %a) vscale_range(16,0) #0 {
; VBITS_GE_2048-LABEL: insertelement_v32f64: ; CHECK-LABEL: insertelement_v32f64:
; VBITS_GE_2048: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_2048-NEXT: mov w9, #31 ; CHECK-NEXT: mov w9, #31
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: fmov d2, #5.00000000 ; CHECK-NEXT: fmov d2, #5.00000000
; VBITS_GE_2048-NEXT: index z3.d, #0, #1 ; CHECK-NEXT: index z3.d, #0, #1
; VBITS_GE_2048-NEXT: ptrue p1.d ; CHECK-NEXT: ptrue p1.d
; VBITS_GE_2048-NEXT: mov z1.d, x9 ; CHECK-NEXT: mov z1.d, x9
; VBITS_GE_2048-NEXT: cmpeq p1.d, p1/z, z3.d, z1.d ; CHECK-NEXT: cmpeq p1.d, p1/z, z3.d, z1.d
; VBITS_GE_2048-NEXT: mov z0.d, p1/m, d2 ; CHECK-NEXT: mov z0.d, p1/m, d2
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8] ; CHECK-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_2048-NEXT: ret ; CHECK-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a %op1 = load <32 x double>, <32 x double>* %a
%r = insertelement <32 x double> %op1, double 5.0, i64 31 %r = insertelement <32 x double> %op1, double 5.0, i64 31
ret <32 x double> %r ret <32 x double> %r

[file diff suppressed because it is too large]

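The final file here (integer compares) keeps both code paths, so functions without a vscale_range attribute carry two autogenerated blocks, one per width-specific prefix; -check-prefixes=CHECK,VBITS_GE_256 activates both prefixes in a single FileCheck run, so output common to every width is asserted once under CHECK and only the divergent sequences are duplicated. The blocks are emitted by utils/update_llc_test_checks.py (hence the NOTE header) and are meant to be regenerated rather than edited by hand. A hypothetical miniature of the layout, abridged from icmp_eq_v64i8 below:

; Hypothetical function, not from the patch; as in the script's output, the
; check comments sit between the define line and the body. The vl32 block
; matches when 512-bit data is split into two 256-bit halves; the vl64
; block matches once a single register suffices.
define void @icmp_demo_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
; VBITS_GE_256-LABEL: icmp_demo_v64i8:
; VBITS_GE_256: ptrue p0.b, vl32
;
; VBITS_GE_512-LABEL: icmp_demo_v64i8:
; VBITS_GE_512: ptrue p0.b, vl64
  %op1 = load <64 x i8>, <64 x i8>* %a
  %op2 = load <64 x i8>, <64 x i8>* %b
  %cmp = icmp eq <64 x i8> %op1, %op2
  %sext = sext <64 x i1> %cmp to <64 x i8>
  store <64 x i8> %sext, <64 x i8>* %a
  ret void
}
attributes #0 = { "target-features"="+sve" }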
@@ -1,58 +1,46 @@
; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256 ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
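Every CHECK/VBITS_GE_* block in the hunks below is machine-generated by the script named in the NOTE line, so a minimal regeneration sketch may help when reading them (the checkout layout, build directory, and test file name here are assumptions, not taken from this commit):

    # Rerun from an LLVM checkout after building llc; the script rewrites
    # the test's assertion blocks in place.
    python3 llvm/utils/update_llc_test_checks.py \
        --llc-binary=build/bin/llc \
        llvm/test/CodeGen/AArch64/sve-fixed-length-int-compares.ll

Without --llc-binary the script falls back to whatever llc it finds on PATH, which is why pinning the freshly built binary is the safer habit.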
target triple = "aarch64-unknown-linux-gnu" target triple = "aarch64-unknown-linux-gnu"
; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: z{0-9}
; ;
; ICMP EQ ; ICMP EQ
; ;
; Don't use SVE for 64-bit vectors. ; Don't use SVE for 64-bit vectors.
define <8 x i8> @icmp_eq_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { define <8 x i8> @icmp_eq_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v8i8: ; CHECK-LABEL: icmp_eq_v8i8:
; CHECK: cmeq v0.8b, v0.8b, v1.8b ; CHECK: // %bb.0:
; CHECK-NEXT: ret ; CHECK-NEXT: cmeq v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
%cmp = icmp eq <8 x i8> %op1, %op2 %cmp = icmp eq <8 x i8> %op1, %op2
%sext = sext <8 x i1> %cmp to <8 x i8> %sext = sext <8 x i1> %cmp to <8 x i8>
ret <8 x i8> %sext ret <8 x i8> %sext
} }
; Don't use SVE for 128-bit vectors. ; Don't use SVE for 128-bit vectors.
define <16 x i8> @icmp_eq_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { define <16 x i8> @icmp_eq_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v16i8: ; CHECK-LABEL: icmp_eq_v16i8:
; CHECK: cmeq v0.16b, v0.16b, v1.16b ; CHECK: // %bb.0:
; CHECK-NEXT: ret ; CHECK-NEXT: cmeq v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%cmp = icmp eq <16 x i8> %op1, %op2 %cmp = icmp eq <16 x i8> %op1, %op2
%sext = sext <16 x i1> %cmp to <16 x i8> %sext = sext <16 x i1> %cmp to <16 x i8>
ret <16 x i8> %sext ret <16 x i8> %sext
} }
define void @icmp_eq_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { define void @icmp_eq_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v32i8: ; CHECK-LABEL: icmp_eq_v32i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl32 ; CHECK: // %bb.0:
; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] ; CHECK-NEXT: ptrue p0.b, vl32
; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: cmpeq [[CMP:p[0-9]+]].b, [[PG]]/z, [[OP1]].b, [[OP2]].b ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: mov [[SEXT:z[0-9]+]].b, [[CMP]]/z, #-1 ; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b
; CHECK-NEXT: st1b { [[SEXT]].b }, [[PG]], [x0] ; CHECK-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: ret ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i8>, <32 x i8>* %a %op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b %op2 = load <32 x i8>, <32 x i8>* %b
%cmp = icmp eq <32 x i8> %op1, %op2 %cmp = icmp eq <32 x i8> %op1, %op2
@@ -62,29 +50,31 @@ define void @icmp_eq_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
} }
define void @icmp_eq_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { define void @icmp_eq_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
; CHECK-LABEL: icmp_eq_v64i8: ; VBITS_GE_256-LABEL: icmp_eq_v64i8:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64 ; VBITS_GE_256: // %bb.0:
; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] ; VBITS_GE_256-NEXT: mov w8, #32
; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] ; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_512-NEXT: cmpeq [[CMP:p[0-9]+]].b, [[PG]]/z, [[OP1]].b, [[OP2]].b ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_512-NEXT: mov [[SEXT:z[0-9]+]].b, [[CMP]]/z, #-1 ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_512-NEXT: st1b { [[SEXT]].b }, [[PG]], [x0] ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8]
; VBITS_GE_512-NEXT: ret ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.b, p0/z, z0.b, z2.b
; Ensure sensible type legalisation ; VBITS_GE_256-NEXT: cmpeq p2.b, p0/z, z1.b, z3.b
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32 ; VBITS_GE_256-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff
; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32 ; VBITS_GE_256-NEXT: mov z1.b, p2/z, #-1 // =0xffffffffffffffff
; VBITS_EQ_256-DAG: ld1b { [[OP1_LO:z[0-9]+]].b }, [[PG]]/z, [x0] ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
; VBITS_EQ_256-DAG: ld1b { [[OP1_HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]] ; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
; VBITS_EQ_256-DAG: ld1b { [[OP2_LO:z[0-9]+]].b }, [[PG]]/z, [x1] ; VBITS_GE_256-NEXT: ret
; VBITS_EQ_256-DAG: ld1b { [[OP2_HI:z[0-9]+]].b }, [[PG]]/z, [x1, x[[NUMELTS]]] ;
; VBITS_EQ_256-DAG: cmpeq [[CMP_LO:p[0-9]+]].b, [[PG]]/z, [[OP1_LO]].b, [[OP2_LO]].b ; VBITS_GE_512-LABEL: icmp_eq_v64i8:
; VBITS_EQ_256-DAG: cmpeq [[CMP_HI:p[0-9]+]].b, [[PG]]/z, [[OP1_HI]].b, [[OP2_HI]].b ; VBITS_GE_512: // %bb.0:
; VBITS_EQ_256-DAG: mov [[SEXT_LO:z[0-9]+]].b, [[CMP_LO]]/z, #-1 ; VBITS_GE_512-NEXT: ptrue p0.b, vl64
; VBITS_EQ_256-DAG: mov [[SEXT_HI:z[0-9]+]].b, [[CMP_HI]]/z, #-1 ; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_EQ_256-DAG: st1b { [[SEXT_LO]].b }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
; VBITS_EQ_256-DAG: st1b { [[SEXT_HI]].b }, [[PG]], [x0, x[[NUMELTS]]] ; VBITS_GE_512-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b
; VBITS_EQ_256-NEXT: ret ; VBITS_GE_512-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <64 x i8>, <64 x i8>* %a %op1 = load <64 x i8>, <64 x i8>* %a
%op2 = load <64 x i8>, <64 x i8>* %b %op2 = load <64 x i8>, <64 x i8>* %b
%cmp = icmp eq <64 x i8> %op1, %op2 %cmp = icmp eq <64 x i8> %op1, %op2
@@ -93,15 +83,16 @@ define void @icmp_eq_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
ret void ret void
} }
define void @icmp_eq_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 { define void @icmp_eq_v128i8(<128 x i8>* %a, <128 x i8>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: icmp_eq_v128i8: ; CHECK-LABEL: icmp_eq_v128i8:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128 ; CHECK: // %bb.0:
; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] ; CHECK-NEXT: ptrue p0.b, vl128
; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_1024-NEXT: cmpeq [[CMP:p[0-9]+]].b, [[PG]]/z, [[OP1]].b, [[OP2]].b ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_1024-NEXT: mov [[SEXT:z[0-9]+]].b, [[CMP]]/z, #-1 ; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b
; VBITS_GE_1024-NEXT: st1b { [[SEXT]].b }, [[PG]], [x0] ; CHECK-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_1024-NEXT: ret ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <128 x i8>, <128 x i8>* %a %op1 = load <128 x i8>, <128 x i8>* %a
%op2 = load <128 x i8>, <128 x i8>* %b %op2 = load <128 x i8>, <128 x i8>* %b
%cmp = icmp eq <128 x i8> %op1, %op2 %cmp = icmp eq <128 x i8> %op1, %op2
@@ -110,15 +101,16 @@ define void @icmp_eq_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
ret void ret void
} }
define void @icmp_eq_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 { define void @icmp_eq_v256i8(<256 x i8>* %a, <256 x i8>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: icmp_eq_v256i8: ; CHECK-LABEL: icmp_eq_v256i8:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256 ; CHECK: // %bb.0:
; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] ; CHECK-NEXT: ptrue p0.b, vl256
; VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_2048-NEXT: cmpeq [[CMP:p[0-9]+]].b, [[PG]]/z, [[OP1]].b, [[OP2]].b ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_2048-NEXT: mov [[SEXT:z[0-9]+]].b, [[CMP]]/z, #-1 ; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b
; VBITS_GE_2048-NEXT: st1b { [[SEXT]].b }, [[PG]], [x0] ; CHECK-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_2048-NEXT: ret ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <256 x i8>, <256 x i8>* %a %op1 = load <256 x i8>, <256 x i8>* %a
%op2 = load <256 x i8>, <256 x i8>* %b %op2 = load <256 x i8>, <256 x i8>* %b
%cmp = icmp eq <256 x i8> %op1, %op2 %cmp = icmp eq <256 x i8> %op1, %op2
@@ -128,34 +120,37 @@ define void @icmp_eq_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
} }
; Don't use SVE for 64-bit vectors. ; Don't use SVE for 64-bit vectors.
define <4 x i16> @icmp_eq_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { define <4 x i16> @icmp_eq_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v4i16: ; CHECK-LABEL: icmp_eq_v4i16:
; CHECK: cmeq v0.4h, v0.4h, v1.4h ; CHECK: // %bb.0:
; CHECK-NEXT: ret ; CHECK-NEXT: cmeq v0.4h, v0.4h, v1.4h
; CHECK-NEXT: ret
%cmp = icmp eq <4 x i16> %op1, %op2 %cmp = icmp eq <4 x i16> %op1, %op2
%sext = sext <4 x i1> %cmp to <4 x i16> %sext = sext <4 x i1> %cmp to <4 x i16>
ret <4 x i16> %sext ret <4 x i16> %sext
} }
; Don't use SVE for 128-bit vectors. ; Don't use SVE for 128-bit vectors.
define <8 x i16> @icmp_eq_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { define <8 x i16> @icmp_eq_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v8i16: ; CHECK-LABEL: icmp_eq_v8i16:
; CHECK: cmeq v0.8h, v0.8h, v1.8h ; CHECK: // %bb.0:
; CHECK-NEXT: ret ; CHECK-NEXT: cmeq v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
%cmp = icmp eq <8 x i16> %op1, %op2 %cmp = icmp eq <8 x i16> %op1, %op2
%sext = sext <8 x i1> %cmp to <8 x i16> %sext = sext <8 x i1> %cmp to <8 x i16>
ret <8 x i16> %sext ret <8 x i16> %sext
} }
define void @icmp_eq_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { define void @icmp_eq_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v16i16: ; CHECK-LABEL: icmp_eq_v16i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 ; CHECK: // %bb.0:
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] ; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: cmpeq [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1 ; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h
; CHECK-NEXT: st1h { [[SEXT]].h }, [[PG]], [x0] ; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: ret ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x i16>, <16 x i16>* %a %op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b %op2 = load <16 x i16>, <16 x i16>* %b
%cmp = icmp eq <16 x i16> %op1, %op2 %cmp = icmp eq <16 x i16> %op1, %op2
@@ -165,29 +160,31 @@ define void @icmp_eq_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
} }
define void @icmp_eq_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { define void @icmp_eq_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
; CHECK-LABEL: icmp_eq_v32i16: ; VBITS_GE_256-LABEL: icmp_eq_v32i16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32 ; VBITS_GE_256: // %bb.0:
; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] ; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_512-NEXT: cmpeq [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_512-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: st1h { [[SEXT]].h }, [[PG]], [x0] ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_512-NEXT: ret ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.h, p0/z, z0.h, z2.h
; Ensure sensible type legalisation ; VBITS_GE_256-NEXT: cmpeq p2.h, p0/z, z1.h, z3.h
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16 ; VBITS_GE_256-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16 ; VBITS_GE_256-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff
; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0] ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1] ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1] ; VBITS_GE_256-NEXT: ret
; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #1] ;
; VBITS_EQ_256-DAG: cmpeq [[CMP_LO:p[0-9]+]].h, [[PG]]/z, [[OP1_LO]].h, [[OP2_LO]].h ; VBITS_GE_512-LABEL: icmp_eq_v32i16:
; VBITS_EQ_256-DAG: cmpeq [[CMP_HI:p[0-9]+]].h, [[PG]]/z, [[OP1_HI]].h, [[OP2_HI]].h ; VBITS_GE_512: // %bb.0:
; VBITS_EQ_256-DAG: mov [[SEXT_LO:z[0-9]+]].h, [[CMP_LO]]/z, #-1 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_EQ_256-DAG: mov [[SEXT_HI:z[0-9]+]].h, [[CMP_HI]]/z, #-1 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_EQ_256-DAG: st1h { [[SEXT_LO]].h }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_EQ_256-DAG: st1h { [[SEXT_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1] ; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h
; VBITS_EQ_256-NEXT: ret ; VBITS_GE_512-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <32 x i16>, <32 x i16>* %a %op1 = load <32 x i16>, <32 x i16>* %a
%op2 = load <32 x i16>, <32 x i16>* %b %op2 = load <32 x i16>, <32 x i16>* %b
%cmp = icmp eq <32 x i16> %op1, %op2 %cmp = icmp eq <32 x i16> %op1, %op2
@@ -196,15 +193,16 @@ define void @icmp_eq_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
ret void ret void
} }
define void @icmp_eq_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 { define void @icmp_eq_v64i16(<64 x i16>* %a, <64 x i16>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: icmp_eq_v64i16: ; CHECK-LABEL: icmp_eq_v64i16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64 ; CHECK: // %bb.0:
; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] ; CHECK-NEXT: ptrue p0.h, vl64
; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: cmpeq [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_1024-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1 ; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h
; VBITS_GE_1024-NEXT: st1h { [[SEXT]].h }, [[PG]], [x0] ; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_1024-NEXT: ret ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <64 x i16>, <64 x i16>* %a %op1 = load <64 x i16>, <64 x i16>* %a
%op2 = load <64 x i16>, <64 x i16>* %b %op2 = load <64 x i16>, <64 x i16>* %b
%cmp = icmp eq <64 x i16> %op1, %op2 %cmp = icmp eq <64 x i16> %op1, %op2
@@ -213,15 +211,16 @@ define void @icmp_eq_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
ret void ret void
} }
define void @icmp_eq_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 { define void @icmp_eq_v128i16(<128 x i16>* %a, <128 x i16>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: icmp_eq_v128i16: ; CHECK-LABEL: icmp_eq_v128i16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128 ; CHECK: // %bb.0:
; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] ; CHECK-NEXT: ptrue p0.h, vl128
; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_2048-NEXT: cmpeq [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_2048-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1 ; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h
; VBITS_GE_2048-NEXT: st1h { [[SEXT]].h }, [[PG]], [x0] ; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_2048-NEXT: ret ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <128 x i16>, <128 x i16>* %a %op1 = load <128 x i16>, <128 x i16>* %a
%op2 = load <128 x i16>, <128 x i16>* %b %op2 = load <128 x i16>, <128 x i16>* %b
%cmp = icmp eq <128 x i16> %op1, %op2 %cmp = icmp eq <128 x i16> %op1, %op2
@@ -231,34 +230,37 @@ define void @icmp_eq_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
} }
; Don't use SVE for 64-bit vectors. ; Don't use SVE for 64-bit vectors.
define <2 x i32> @icmp_eq_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { define <2 x i32> @icmp_eq_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v2i32: ; CHECK-LABEL: icmp_eq_v2i32:
; CHECK: cmeq v0.2s, v0.2s, v1.2s ; CHECK: // %bb.0:
; CHECK-NEXT: ret ; CHECK-NEXT: cmeq v0.2s, v0.2s, v1.2s
; CHECK-NEXT: ret
%cmp = icmp eq <2 x i32> %op1, %op2 %cmp = icmp eq <2 x i32> %op1, %op2
%sext = sext <2 x i1> %cmp to <2 x i32> %sext = sext <2 x i1> %cmp to <2 x i32>
ret <2 x i32> %sext ret <2 x i32> %sext
} }
; Don't use SVE for 128-bit vectors. ; Don't use SVE for 128-bit vectors.
define <4 x i32> @icmp_eq_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { define <4 x i32> @icmp_eq_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v4i32: ; CHECK-LABEL: icmp_eq_v4i32:
; CHECK: cmeq v0.4s, v0.4s, v1.4s ; CHECK: // %bb.0:
; CHECK-NEXT: ret ; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
%cmp = icmp eq <4 x i32> %op1, %op2 %cmp = icmp eq <4 x i32> %op1, %op2
%sext = sext <4 x i1> %cmp to <4 x i32> %sext = sext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %sext ret <4 x i32> %sext
} }
define void @icmp_eq_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { define void @icmp_eq_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v8i32: ; CHECK-LABEL: icmp_eq_v8i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8 ; CHECK: // %bb.0:
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] ; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: cmpeq [[CMP:p[0-9]+]].s, [[PG]]/z, [[OP1]].s, [[OP2]].s ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: mov [[SEXT:z[0-9]+]].s, [[CMP]]/z, #-1 ; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s
; CHECK-NEXT: st1w { [[SEXT]].s }, [[PG]], [x0] ; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: ret ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <8 x i32>, <8 x i32>* %a %op1 = load <8 x i32>, <8 x i32>* %a
%op2 = load <8 x i32>, <8 x i32>* %b %op2 = load <8 x i32>, <8 x i32>* %b
%cmp = icmp eq <8 x i32> %op1, %op2 %cmp = icmp eq <8 x i32> %op1, %op2
@@ -268,29 +270,31 @@ define void @icmp_eq_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
} }
define void @icmp_eq_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { define void @icmp_eq_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
; CHECK-LABEL: icmp_eq_v16i32: ; VBITS_GE_256-LABEL: icmp_eq_v16i32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16 ; VBITS_GE_256: // %bb.0:
; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] ; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_512-NEXT: cmpeq [[CMP:p[0-9]+]].s, [[PG]]/z, [[OP1]].s, [[OP2]].s ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_512-NEXT: mov [[SEXT:z[0-9]+]].s, [[CMP]]/z, #-1 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: st1w { [[SEXT]].s }, [[PG]], [x0] ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_512-NEXT: ret ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, z2.s
; Ensure sensible type legalisation ; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z1.s, z3.s
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8 ; VBITS_GE_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8 ; VBITS_GE_256-NEXT: mov z1.s, p2/z, #-1 // =0xffffffffffffffff
; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0] ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2] ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1] ; VBITS_GE_256-NEXT: ret
; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #2] ;
; VBITS_EQ_256-DAG: cmpeq [[CMP_LO:p[0-9]+]].s, [[PG]]/z, [[OP1_LO]].s, [[OP2_LO]].s ; VBITS_GE_512-LABEL: icmp_eq_v16i32:
; VBITS_EQ_256-DAG: cmpeq [[CMP_HI:p[0-9]+]].s, [[PG]]/z, [[OP1_HI]].s, [[OP2_HI]].s ; VBITS_GE_512: // %bb.0:
; VBITS_EQ_256-DAG: mov [[SEXT_LO:z[0-9]+]].s, [[CMP_LO]]/z, #-1 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_EQ_256-DAG: mov [[SEXT_HI:z[0-9]+]].s, [[CMP_HI]]/z, #-1 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_EQ_256-DAG: st1w { [[SEXT_LO]].s }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_EQ_256-DAG: st1w { [[SEXT_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2] ; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s
; VBITS_EQ_256-NEXT: ret ; VBITS_GE_512-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i32>, <16 x i32>* %a %op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b %op2 = load <16 x i32>, <16 x i32>* %b
%cmp = icmp eq <16 x i32> %op1, %op2 %cmp = icmp eq <16 x i32> %op1, %op2
@@ -299,15 +303,16 @@ define void @icmp_eq_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
ret void ret void
} }
define void @icmp_eq_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 { define void @icmp_eq_v32i32(<32 x i32>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: icmp_eq_v32i32: ; CHECK-LABEL: icmp_eq_v32i32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32 ; CHECK: // %bb.0:
; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] ; CHECK-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: cmpeq [[CMP:p[0-9]+]].s, [[PG]]/z, [[OP1]].s, [[OP2]].s ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_1024-NEXT: mov [[SEXT:z[0-9]+]].s, [[CMP]]/z, #-1 ; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_1024-NEXT: st1w { [[SEXT]].s }, [[PG]], [x0] ; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_1024-NEXT: ret ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a %op1 = load <32 x i32>, <32 x i32>* %a
%op2 = load <32 x i32>, <32 x i32>* %b %op2 = load <32 x i32>, <32 x i32>* %b
%cmp = icmp eq <32 x i32> %op1, %op2 %cmp = icmp eq <32 x i32> %op1, %op2
@@ -316,15 +321,16 @@ define void @icmp_eq_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
ret void ret void
} }
define void @icmp_eq_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 { define void @icmp_eq_v64i32(<64 x i32>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: icmp_eq_v64i32: ; CHECK-LABEL: icmp_eq_v64i32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64 ; CHECK: // %bb.0:
; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] ; CHECK-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: cmpeq [[CMP:p[0-9]+]].s, [[PG]]/z, [[OP1]].s, [[OP2]].s ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_2048-NEXT: mov [[SEXT:z[0-9]+]].s, [[CMP]]/z, #-1 ; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_2048-NEXT: st1w { [[SEXT]].s }, [[PG]], [x0] ; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_2048-NEXT: ret ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <64 x i32>, <64 x i32>* %a %op1 = load <64 x i32>, <64 x i32>* %a
%op2 = load <64 x i32>, <64 x i32>* %b %op2 = load <64 x i32>, <64 x i32>* %b
%cmp = icmp eq <64 x i32> %op1, %op2 %cmp = icmp eq <64 x i32> %op1, %op2
@@ -334,34 +340,37 @@ define void @icmp_eq_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
} }
; Don't use SVE for 64-bit vectors. ; Don't use SVE for 64-bit vectors.
define <1 x i64> @icmp_eq_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { define <1 x i64> @icmp_eq_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v1i64: ; CHECK-LABEL: icmp_eq_v1i64:
; CHECK: cmeq d0, d0, d1 ; CHECK: // %bb.0:
; CHECK-NEXT: ret ; CHECK-NEXT: cmeq d0, d0, d1
; CHECK-NEXT: ret
%cmp = icmp eq <1 x i64> %op1, %op2 %cmp = icmp eq <1 x i64> %op1, %op2
%sext = sext <1 x i1> %cmp to <1 x i64> %sext = sext <1 x i1> %cmp to <1 x i64>
ret <1 x i64> %sext ret <1 x i64> %sext
} }
; Don't use SVE for 128-bit vectors. ; Don't use SVE for 128-bit vectors.
define <2 x i64> @icmp_eq_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { define <2 x i64> @icmp_eq_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v2i64: ; CHECK-LABEL: icmp_eq_v2i64:
; CHECK: cmeq v0.2d, v0.2d, v1.2d ; CHECK: // %bb.0:
; CHECK-NEXT: ret ; CHECK-NEXT: cmeq v0.2d, v0.2d, v1.2d
; CHECK-NEXT: ret
%cmp = icmp eq <2 x i64> %op1, %op2 %cmp = icmp eq <2 x i64> %op1, %op2
%sext = sext <2 x i1> %cmp to <2 x i64> %sext = sext <2 x i1> %cmp to <2 x i64>
ret <2 x i64> %sext ret <2 x i64> %sext
} }
define void @icmp_eq_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { define void @icmp_eq_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_eq_v4i64: ; CHECK-LABEL: icmp_eq_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4 ; CHECK: // %bb.0:
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] ; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: cmpeq [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP1]].d, [[OP2]].d ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1 ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d
; CHECK-NEXT: st1d { [[SEXT]].d }, [[PG]], [x0] ; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: ret ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <4 x i64>, <4 x i64>* %a %op1 = load <4 x i64>, <4 x i64>* %a
%op2 = load <4 x i64>, <4 x i64>* %b %op2 = load <4 x i64>, <4 x i64>* %b
%cmp = icmp eq <4 x i64> %op1, %op2 %cmp = icmp eq <4 x i64> %op1, %op2
@@ -371,29 +380,31 @@ define void @icmp_eq_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
} }
define void @icmp_eq_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { define void @icmp_eq_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
; CHECK-LABEL: icmp_eq_v8i64: ; VBITS_GE_256-LABEL: icmp_eq_v8i64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8 ; VBITS_GE_256: // %bb.0:
; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] ; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_512-NEXT: cmpeq [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP1]].d, [[OP2]].d ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_512-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: st1d { [[SEXT]].d }, [[PG]], [x0] ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_512-NEXT: ret ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, z2.d
; Ensure sensible type legalisation ; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, z3.d
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4 ; VBITS_GE_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4 ; VBITS_GE_256-NEXT: mov z1.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0] ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3] ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1] ; VBITS_GE_256-NEXT: ret
; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3] ;
; VBITS_EQ_256-DAG: cmpeq [[CMP_LO:p[0-9]+]].d, [[PG]]/z, [[OP1_LO]].d, [[OP2_LO]].d ; VBITS_GE_512-LABEL: icmp_eq_v8i64:
; VBITS_EQ_256-DAG: cmpeq [[CMP_HI:p[0-9]+]].d, [[PG]]/z, [[OP1_HI]].d, [[OP2_HI]].d ; VBITS_GE_512: // %bb.0:
; VBITS_EQ_256-DAG: mov [[SEXT_LO:z[0-9]+]].d, [[CMP_LO]]/z, #-1 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_EQ_256-DAG: mov [[SEXT_HI:z[0-9]+]].d, [[CMP_HI]]/z, #-1 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_EQ_256-DAG: st1d { [[SEXT_LO]].d }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_EQ_256-DAG: st1d { [[SEXT_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d
; VBITS_EQ_256-NEXT: ret ; VBITS_GE_512-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <8 x i64>, <8 x i64>* %a %op1 = load <8 x i64>, <8 x i64>* %a
%op2 = load <8 x i64>, <8 x i64>* %b %op2 = load <8 x i64>, <8 x i64>* %b
%cmp = icmp eq <8 x i64> %op1, %op2 %cmp = icmp eq <8 x i64> %op1, %op2
@@ -402,15 +413,16 @@ define void @icmp_eq_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
ret void ret void
} }
define void @icmp_eq_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 { define void @icmp_eq_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: icmp_eq_v16i64: ; CHECK-LABEL: icmp_eq_v16i64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16 ; CHECK: // %bb.0:
; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] ; CHECK-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: cmpeq [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP1]].d, [[OP2]].d ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_1024-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1 ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_1024-NEXT: st1d { [[SEXT]].d }, [[PG]], [x0] ; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_1024-NEXT: ret ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a %op1 = load <16 x i64>, <16 x i64>* %a
%op2 = load <16 x i64>, <16 x i64>* %b %op2 = load <16 x i64>, <16 x i64>* %b
%cmp = icmp eq <16 x i64> %op1, %op2 %cmp = icmp eq <16 x i64> %op1, %op2
@@ -419,15 +431,16 @@ define void @icmp_eq_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
ret void ret void
} }
define void @icmp_eq_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 { define void @icmp_eq_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: icmp_eq_v32i64: ; CHECK-LABEL: icmp_eq_v32i64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32 ; CHECK: // %bb.0:
; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] ; CHECK-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: cmpeq [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP1]].d, [[OP2]].d ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_2048-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1 ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_2048-NEXT: st1d { [[SEXT]].d }, [[PG]], [x0] ; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_2048-NEXT: ret ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a %op1 = load <32 x i64>, <32 x i64>* %a
%op2 = load <32 x i64>, <32 x i64>* %b %op2 = load <32 x i64>, <32 x i64>* %b
%cmp = icmp eq <32 x i64> %op1, %op2 %cmp = icmp eq <32 x i64> %op1, %op2
@@ -440,15 +453,16 @@ define void @icmp_eq_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
; ICMP NE ; ICMP NE
; ;
define void @icmp_ne_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { define void @icmp_ne_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_ne_v32i8: ; CHECK-LABEL: icmp_ne_v32i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl32 ; CHECK: // %bb.0:
; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] ; CHECK-NEXT: ptrue p0.b, vl32
; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: cmpne [[CMP:p[0-9]+]].b, [[PG]]/z, [[OP1]].b, [[OP2]].b ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: mov [[SEXT:z[0-9]+]].b, [[CMP]]/z, #-1 ; CHECK-NEXT: cmpne p1.b, p0/z, z0.b, z1.b
; CHECK-NEXT: st1b { [[SEXT]].b }, [[PG]], [x0] ; CHECK-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: ret ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i8>, <32 x i8>* %a %op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b %op2 = load <32 x i8>, <32 x i8>* %b
%cmp = icmp ne <32 x i8> %op1, %op2 %cmp = icmp ne <32 x i8> %op1, %op2
@@ -461,15 +475,16 @@ define void @icmp_ne_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
; ICMP SGE ; ICMP SGE
; ;
define void @icmp_sge_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { define void @icmp_sge_v32i16(<32 x i16>* %a, <32 x i16>* %b) vscale_range(4,0) #0 {
; CHECK-LABEL: icmp_sge_v32i16: ; CHECK-LABEL: icmp_sge_v32i16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32 ; CHECK: // %bb.0:
; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] ; CHECK-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: cmpge [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1 ; CHECK-NEXT: cmpge p1.h, p0/z, z0.h, z1.h
; VBITS_GE_512-NEXT: st1h { [[SEXT]].h }, [[PG]], [x0] ; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_512-NEXT: ret ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i16>, <32 x i16>* %a %op1 = load <32 x i16>, <32 x i16>* %a
%op2 = load <32 x i16>, <32 x i16>* %b %op2 = load <32 x i16>, <32 x i16>* %b
%cmp = icmp sge <32 x i16> %op1, %op2 %cmp = icmp sge <32 x i16> %op1, %op2
@@ -482,15 +497,16 @@ define void @icmp_sge_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
; ICMP SGT ; ICMP SGT
; ;
define void @icmp_sgt_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { define void @icmp_sgt_v16i16(<16 x i16>* %a, <16 x i16>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_sgt_v16i16: ; CHECK-LABEL: icmp_sgt_v16i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 ; CHECK: // %bb.0:
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] ; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: cmpgt [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1 ; CHECK-NEXT: cmpgt p1.h, p0/z, z0.h, z1.h
; CHECK-NEXT: st1h { [[SEXT]].h }, [[PG]], [x0] ; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: ret ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x i16>, <16 x i16>* %a %op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b %op2 = load <16 x i16>, <16 x i16>* %b
%cmp = icmp sgt <16 x i16> %op1, %op2 %cmp = icmp sgt <16 x i16> %op1, %op2
@@ -503,15 +519,16 @@ define void @icmp_sgt_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
; ICMP SLE ; ICMP SLE
; ;
define void @icmp_sle_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { define void @icmp_sle_v16i32(<16 x i32>* %a, <16 x i32>* %b) vscale_range(4,0) #0 {
; CHECK-LABEL: icmp_sle_v16i32: ; CHECK-LABEL: icmp_sle_v16i32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16 ; CHECK: // %bb.0:
; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] ; CHECK-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: cmpge [[CMP:p[0-9]+]].s, [[PG]]/z, [[OP2]].s, [[OP1]].s ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: mov [[SEXT:z[0-9]+]].s, [[CMP]]/z, #-1 ; CHECK-NEXT: cmpge p1.s, p0/z, z1.s, z0.s
; VBITS_GE_512-NEXT: st1w { [[SEXT]].s }, [[PG]], [x0] ; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_512-NEXT: ret ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x i32>, <16 x i32>* %a %op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b %op2 = load <16 x i32>, <16 x i32>* %b
%cmp = icmp sle <16 x i32> %op1, %op2 %cmp = icmp sle <16 x i32> %op1, %op2
@@ -524,15 +541,16 @@ define void @icmp_sle_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
; ICMP SLT ; ICMP SLT
; ;
define void @icmp_slt_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { define void @icmp_slt_v8i32(<8 x i32>* %a, <8 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_slt_v8i32: ; CHECK-LABEL: icmp_slt_v8i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8 ; CHECK: // %bb.0:
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] ; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: cmpgt [[CMP:p[0-9]+]].s, [[PG]]/z, [[OP2]].s, [[OP1]].s ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: mov [[SEXT:z[0-9]+]].s, [[CMP]]/z, #-1 ; CHECK-NEXT: cmpgt p1.s, p0/z, z1.s, z0.s
; CHECK-NEXT: st1w { [[SEXT]].s }, [[PG]], [x0] ; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: ret ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <8 x i32>, <8 x i32>* %a %op1 = load <8 x i32>, <8 x i32>* %a
%op2 = load <8 x i32>, <8 x i32>* %b %op2 = load <8 x i32>, <8 x i32>* %b
%cmp = icmp slt <8 x i32> %op1, %op2 %cmp = icmp slt <8 x i32> %op1, %op2
@@ -545,15 +563,16 @@ define void @icmp_slt_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
; ICMP UGE ; ICMP UGE
; ;
define void @icmp_uge_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { define void @icmp_uge_v8i64(<8 x i64>* %a, <8 x i64>* %b) vscale_range(4,0) #0 {
; CHECK-LABEL: icmp_uge_v8i64: ; CHECK-LABEL: icmp_uge_v8i64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8 ; CHECK: // %bb.0:
; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] ; CHECK-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: cmphs [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP1]].d, [[OP2]].d ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1 ; CHECK-NEXT: cmphs p1.d, p0/z, z0.d, z1.d
; VBITS_GE_512-NEXT: st1d { [[SEXT]].d }, [[PG]], [x0] ; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_512-NEXT: ret ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <8 x i64>, <8 x i64>* %a %op1 = load <8 x i64>, <8 x i64>* %a
%op2 = load <8 x i64>, <8 x i64>* %b %op2 = load <8 x i64>, <8 x i64>* %b
%cmp = icmp uge <8 x i64> %op1, %op2 %cmp = icmp uge <8 x i64> %op1, %op2
@@ -566,15 +585,16 @@ define void @icmp_uge_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
; ICMP UGT ; ICMP UGT
; ;
define void @icmp_ugt_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { define void @icmp_ugt_v4i64(<4 x i64>* %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: icmp_ugt_v4i64: ; CHECK-LABEL: icmp_ugt_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4 ; CHECK: // %bb.0:
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] ; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: cmphi [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP1]].d, [[OP2]].d ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1 ; CHECK-NEXT: cmphi p1.d, p0/z, z0.d, z1.d
; CHECK-NEXT: st1d { [[SEXT]].d }, [[PG]], [x0] ; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: ret ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <4 x i64>, <4 x i64>* %a %op1 = load <4 x i64>, <4 x i64>* %a
%op2 = load <4 x i64>, <4 x i64>* %b %op2 = load <4 x i64>, <4 x i64>* %b
%cmp = icmp ugt <4 x i64> %op1, %op2 %cmp = icmp ugt <4 x i64> %op1, %op2
@@ -587,15 +607,16 @@ define void @icmp_ugt_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
; ICMP ULE ; ICMP ULE
; ;
define void @icmp_ule_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 { define void @icmp_ule_v16i64(<16 x i64>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: icmp_ule_v16i64: ; CHECK-LABEL: icmp_ule_v16i64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16 ; CHECK: // %bb.0:
; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] ; CHECK-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: cmphs [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP2]].d, [[OP1]].d ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_1024-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1 ; CHECK-NEXT: cmphs p1.d, p0/z, z1.d, z0.d
; VBITS_GE_1024-NEXT: st1d { [[SEXT]].d }, [[PG]], [x0] ; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_1024-NEXT: ret ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a %op1 = load <16 x i64>, <16 x i64>* %a
%op2 = load <16 x i64>, <16 x i64>* %b %op2 = load <16 x i64>, <16 x i64>* %b
%cmp = icmp ule <16 x i64> %op1, %op2 %cmp = icmp ule <16 x i64> %op1, %op2
@@ -608,15 +629,16 @@ define void @icmp_ule_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
; ICMP ULT ; ICMP ULT
; ;
define void @icmp_ult_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 { define void @icmp_ult_v32i64(<32 x i64>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: icmp_ult_v32i64: ; CHECK-LABEL: icmp_ult_v32i64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32 ; CHECK: // %bb.0:
; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] ; CHECK-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: cmphi [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP2]].d, [[OP1]].d ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_2048-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1 ; CHECK-NEXT: cmphi p1.d, p0/z, z1.d, z0.d
; VBITS_GE_2048-NEXT: st1d { [[SEXT]].d }, [[PG]], [x0] ; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_2048-NEXT: ret ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a %op1 = load <32 x i64>, <32 x i64>* %a
%op2 = load <32 x i64>, <32 x i64>* %b %op2 = load <32 x i64>, <32 x i64>* %b
%cmp = icmp ult <32 x i64> %op1, %op2 %cmp = icmp ult <32 x i64> %op1, %op2

File diff suppressed because it is too large (7 files)

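Before the next file, it is worth spelling out the mechanism both files lean on: vscale_range raises the known minimum SVE width per function, so every remaining RUN line compiles the function to the same code and the old per-width prefixes collapse into plain CHECK. A hypothetical standalone test sketching this (the function, file, and RUN lines are illustrative, not part of the commit):

    ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s
    ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s
    target triple = "aarch64-unknown-linux-gnu"

    ; vscale_range(2,0) promises vscale >= 2 (at least 256-bit SVE) with no
    ; upper bound, and the vl32-style predicate fixes the element count, so
    ; both RUN lines above should emit identical asm for this function.
    define void @copy_v32i8(<32 x i8>* %a, <32 x i8>* %b) vscale_range(2,0) #0 {
      %v = load volatile <32 x i8>, <32 x i8>* %a
      store volatile <32 x i8> %v, <32 x i8>* %b
      ret void
    }

    attributes #0 = { "target-features"="+sve" }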
@@ -1,62 +1,50 @@
; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
target triple = "aarch64-unknown-linux-gnu" target triple = "aarch64-unknown-linux-gnu"
; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: ptrue
; Don't use SVE for 64-bit vectors. ; Don't use SVE for 64-bit vectors.
define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, i1 %mask) #0 { define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v8i8: ; CHECK-LABEL: select_v8i8:
; CHECK: tst w0, #0x1 ; CHECK: // %bb.0:
; CHECK-NEXT: csetm w8, ne ; CHECK-NEXT: tst w0, #0x1
; CHECK-NEXT: dup v2.8b, w8 ; CHECK-NEXT: csetm w8, ne
; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: dup v2.8b, w8
; CHECK-NEXT: ret ; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
; CHECK-NEXT: ret
%sel = select i1 %mask, <8 x i8> %op1, <8 x i8> %op2 %sel = select i1 %mask, <8 x i8> %op1, <8 x i8> %op2
ret <8 x i8> %sel ret <8 x i8> %sel
} }
; Don't use SVE for 128-bit vectors. ; Don't use SVE for 128-bit vectors.
define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, i1 %mask) #0 { define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v16i8: ; CHECK-LABEL: select_v16i8:
; CHECK: tst w0, #0x1 ; CHECK: // %bb.0:
; CHECK-NEXT: csetm w8, ne ; CHECK-NEXT: tst w0, #0x1
; CHECK-NEXT: dup v2.16b, w8 ; CHECK-NEXT: csetm w8, ne
; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: dup v2.16b, w8
; CHECK-NEXT: ret ; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT: ret
%sel = select i1 %mask, <16 x i8> %op1, <16 x i8> %op2 %sel = select i1 %mask, <16 x i8> %op1, <16 x i8> %op2
ret <16 x i8> %sel ret <16 x i8> %sel
} }
define void @select_v32i8(<32 x i8>* %a, <32 x i8>* %b, i1 %mask) #0 { define void @select_v32i8(<32 x i8>* %a, <32 x i8>* %b, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v32i8: ; CHECK-LABEL: select_v32i8:
; CHECK: and w[[AND:[0-9]+]], w2, #0x1 ; CHECK: // %bb.0:
; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].b, vl[[#min(VBYTES,32)]] ; CHECK-NEXT: and w8, w2, #0x1
; CHECK-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0] ; CHECK-NEXT: ptrue p0.b, vl32
; CHECK-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1] ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].b ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: mov [[TMP1:z[0-9]+]].b, w[[AND]] ; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: and [[TMP2:z[0-9]+]].b, [[TMP1]].b, #0x1 ; CHECK-NEXT: mov z2.b, w8
; CHECK-NEXT: cmpne [[PRES:p[0-9]+]].b, [[PG2]]/z, [[TMP2]].b, #0 ; CHECK-NEXT: and z2.b, z2.b, #0x1
; CHECK-NEXT: sel [[RES:z[0-9]+]].b, [[PRES]], [[OP1]].b, [[OP2]].b ; CHECK-NEXT: cmpne p1.b, p1/z, z2.b, #0
; CHECK-NEXT: st1b { [[RES]].b }, [[PG1]], [x0] ; CHECK-NEXT: sel z0.b, p1, z0.b, z1.b
; CHECK-NEXT: ret ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load volatile <32 x i8>, <32 x i8>* %a %op1 = load volatile <32 x i8>, <32 x i8>* %a
%op2 = load volatile <32 x i8>, <32 x i8>* %b %op2 = load volatile <32 x i8>, <32 x i8>* %b
%sel = select i1 %mask, <32 x i8> %op1, <32 x i8> %op2 %sel = select i1 %mask, <32 x i8> %op1, <32 x i8> %op2
@@ -65,18 +53,38 @@ define void @select_v32i8(<32 x i8>* %a, <32 x i8>* %b, i1 %mask) #0 {
} }
define void @select_v64i8(<64 x i8>* %a, <64 x i8>* %b, i1 %mask) #0 { define void @select_v64i8(<64 x i8>* %a, <64 x i8>* %b, i1 %mask) #0 {
; CHECK-LABEL: select_v64i8: ; VBITS_GE_256-LABEL: select_v64i8:
; VBITS_GE_512: and w[[AND:[0-9]+]], w2, #0x1 ; VBITS_GE_256: // %bb.0:
; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].b, vl[[#min(VBYTES,64)]] ; VBITS_GE_256-NEXT: mov w8, #32
; VBITS_GE_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0] ; VBITS_GE_256-NEXT: and w9, w2, #0x1
; VBITS_GE_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1] ; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].b ; VBITS_GE_256-NEXT: ptrue p1.b
; VBITS_GE_512-NEXT: mov [[TMP1:z[0-9]+]].b, w[[AND]] ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_512-NEXT: and [[TMP2:z[0-9]+]].b, [[TMP1]].b, #0x1 ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_512-NEXT: cmpne [[PRES:p[0-9]+]].b, [[PG2]]/z, [[TMP2]].b, #0 ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8]
; VBITS_GE_512-NEXT: sel [[RES:z[0-9]+]].b, [[PRES]], [[OP1]].b, [[OP2]].b ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG1]], [x0] ; VBITS_GE_256-NEXT: mov z4.b, w9
; VBITS_GE_512-NEXT: ret ; VBITS_GE_256-NEXT: and z4.b, z4.b, #0x1
; VBITS_GE_256-NEXT: cmpne p1.b, p1/z, z4.b, #0
; VBITS_GE_256-NEXT: sel z1.b, p1, z1.b, z3.b
; VBITS_GE_256-NEXT: sel z0.b, p1, z0.b, z2.b
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: select_v64i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: and w8, w2, #0x1
; VBITS_GE_512-NEXT: ptrue p0.b, vl64
; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT: ptrue p1.b
; VBITS_GE_512-NEXT: mov z2.b, w8
; VBITS_GE_512-NEXT: and z2.b, z2.b, #0x1
; VBITS_GE_512-NEXT: cmpne p1.b, p1/z, z2.b, #0
; VBITS_GE_512-NEXT: sel z0.b, p1, z0.b, z1.b
; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load volatile <64 x i8>, <64 x i8>* %a %op1 = load volatile <64 x i8>, <64 x i8>* %a
%op2 = load volatile <64 x i8>, <64 x i8>* %b %op2 = load volatile <64 x i8>, <64 x i8>* %b
%sel = select i1 %mask, <64 x i8> %op1, <64 x i8> %op2 %sel = select i1 %mask, <64 x i8> %op1, <64 x i8> %op2
@@ -84,19 +92,20 @@ define void @select_v64i8(<64 x i8>* %a, <64 x i8>* %b, i1 %mask) #0 {
ret void ret void
} }
define void @select_v128i8(<128 x i8>* %a, <128 x i8>* %b, i1 %mask) #0 { define void @select_v128i8(<128 x i8>* %a, <128 x i8>* %b, i1 %mask) vscale_range(8,0) #0 {
; CHECK-LABEL: select_v128i8: ; CHECK-LABEL: select_v128i8:
; VBITS_GE_1024: and w[[AND:[0-9]+]], w2, #0x1 ; CHECK: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].b, vl[[#min(VBYTES,128)]] ; CHECK-NEXT: and w8, w2, #0x1
; VBITS_GE_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0] ; CHECK-NEXT: ptrue p0.b, vl128
; VBITS_GE_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1] ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].b ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_1024-NEXT: mov [[TMP1:z[0-9]+]].b, w[[AND]] ; CHECK-NEXT: ptrue p1.b
; VBITS_GE_1024-NEXT: and [[TMP2:z[0-9]+]].b, [[TMP1]].b, #0x1 ; CHECK-NEXT: mov z2.b, w8
; VBITS_GE_1024-NEXT: cmpne [[PRES:p[0-9]+]].b, [[PG2]]/z, [[TMP2]].b, #0 ; CHECK-NEXT: and z2.b, z2.b, #0x1
; VBITS_GE_1024-NEXT: sel [[RES:z[0-9]+]].b, [[PRES]], [[OP1]].b, [[OP2]].b ; CHECK-NEXT: cmpne p1.b, p1/z, z2.b, #0
; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG1]], [x0] ; CHECK-NEXT: sel z0.b, p1, z0.b, z1.b
; VBITS_GE_1024-NEXT: ret ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load volatile <128 x i8>, <128 x i8>* %a %op1 = load volatile <128 x i8>, <128 x i8>* %a
%op2 = load volatile <128 x i8>, <128 x i8>* %b %op2 = load volatile <128 x i8>, <128 x i8>* %b
%sel = select i1 %mask, <128 x i8> %op1, <128 x i8> %op2 %sel = select i1 %mask, <128 x i8> %op1, <128 x i8> %op2
@@ -104,19 +113,20 @@ define void @select_v128i8(<128 x i8>* %a, <128 x i8>* %b, i1 %mask) #0 {
ret void ret void
} }
define void @select_v256i8(<256 x i8>* %a, <256 x i8>* %b, i1 %mask) #0 { define void @select_v256i8(<256 x i8>* %a, <256 x i8>* %b, i1 %mask) vscale_range(16,0) #0 {
; CHECK-LABEL: select_v256i8: ; CHECK-LABEL: select_v256i8:
; VBITS_GE_2048: and w[[AND:[0-9]+]], w2, #0x1 ; CHECK: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].b, vl[[#min(VBYTES,256)]] ; CHECK-NEXT: and w8, w2, #0x1
; VBITS_GE_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0] ; CHECK-NEXT: ptrue p0.b, vl256
; VBITS_GE_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1] ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].b ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_2048-NEXT: mov [[TMP1:z[0-9]+]].b, w[[AND]] ; CHECK-NEXT: ptrue p1.b
; VBITS_GE_2048-NEXT: and [[TMP2:z[0-9]+]].b, [[TMP1]].b, #0x1 ; CHECK-NEXT: mov z2.b, w8
; VBITS_GE_2048-NEXT: cmpne [[PRES:p[0-9]+]].b, [[PG2]]/z, [[TMP2]].b, #0 ; CHECK-NEXT: and z2.b, z2.b, #0x1
; VBITS_GE_2048-NEXT: sel [[RES:z[0-9]+]].b, [[PRES]], [[OP1]].b, [[OP2]].b ; CHECK-NEXT: cmpne p1.b, p1/z, z2.b, #0
; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG1]], [x0] ; CHECK-NEXT: sel z0.b, p1, z0.b, z1.b
; VBITS_GE_2048-NEXT: ret ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load volatile <256 x i8>, <256 x i8>* %a %op1 = load volatile <256 x i8>, <256 x i8>* %a
%op2 = load volatile <256 x i8>, <256 x i8>* %b %op2 = load volatile <256 x i8>, <256 x i8>* %b
%sel = select i1 %mask, <256 x i8> %op1, <256 x i8> %op2 %sel = select i1 %mask, <256 x i8> %op1, <256 x i8> %op2
@@ -125,42 +135,45 @@ define void @select_v256i8(<256 x i8>* %a, <256 x i8>* %b, i1 %mask) #0 {
} }
; Don't use SVE for 64-bit vectors. ; Don't use SVE for 64-bit vectors.
define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, i1 %mask) #0 { define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4i16: ; CHECK-LABEL: select_v4i16:
; CHECK: tst w0, #0x1 ; CHECK: // %bb.0:
; CHECK-NEXT: csetm w8, ne ; CHECK-NEXT: tst w0, #0x1
; CHECK-NEXT: dup v2.4h, w8 ; CHECK-NEXT: csetm w8, ne
; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: dup v2.4h, w8
; CHECK-NEXT: ret ; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
; CHECK-NEXT: ret
%sel = select i1 %mask, <4 x i16> %op1, <4 x i16> %op2 %sel = select i1 %mask, <4 x i16> %op1, <4 x i16> %op2
ret <4 x i16> %sel ret <4 x i16> %sel
} }
; Don't use SVE for 128-bit vectors. ; Don't use SVE for 128-bit vectors.
define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, i1 %mask) #0 { define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v8i16: ; CHECK-LABEL: select_v8i16:
; CHECK: tst w0, #0x1 ; CHECK: // %bb.0:
; CHECK-NEXT: csetm w8, ne ; CHECK-NEXT: tst w0, #0x1
; CHECK-NEXT: dup v2.8h, w8 ; CHECK-NEXT: csetm w8, ne
; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: dup v2.8h, w8
; CHECK-NEXT: ret ; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT: ret
%sel = select i1 %mask, <8 x i16> %op1, <8 x i16> %op2 %sel = select i1 %mask, <8 x i16> %op1, <8 x i16> %op2
ret <8 x i16> %sel ret <8 x i16> %sel
} }
define void @select_v16i16(<16 x i16>* %a, <16 x i16>* %b, i1 %mask) #0 { define void @select_v16i16(<16 x i16>* %a, <16 x i16>* %b, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v16i16: ; CHECK-LABEL: select_v16i16:
; CHECK: and w[[AND:[0-9]+]], w2, #0x1 ; CHECK: // %bb.0:
; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]] ; CHECK-NEXT: and w8, w2, #0x1
; CHECK-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0] ; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1] ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].h ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: mov [[TMP1:z[0-9]+]].h, w[[AND]] ; CHECK-NEXT: ptrue p1.h
; CHECK-NEXT: and [[TMP2:z[0-9]+]].h, [[TMP1]].h, #0x1 ; CHECK-NEXT: mov z2.h, w8
; CHECK-NEXT: cmpne [[PRES:p[0-9]+]].h, [[PG2]]/z, [[TMP2]].h, #0 ; CHECK-NEXT: and z2.h, z2.h, #0x1
; CHECK-NEXT: sel [[RES:z[0-9]+]].h, [[PRES]], [[OP1]].h, [[OP2]].h ; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0
; CHECK-NEXT: st1h { [[RES]].h }, [[PG1]], [x0] ; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
; CHECK-NEXT: ret ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load volatile <16 x i16>, <16 x i16>* %a %op1 = load volatile <16 x i16>, <16 x i16>* %a
%op2 = load volatile <16 x i16>, <16 x i16>* %b %op2 = load volatile <16 x i16>, <16 x i16>* %b
%sel = select i1 %mask, <16 x i16> %op1, <16 x i16> %op2 %sel = select i1 %mask, <16 x i16> %op1, <16 x i16> %op2
@@ -169,18 +182,38 @@ define void @select_v16i16(<16 x i16>* %a, <16 x i16>* %b, i1 %mask) #0 {
} }
define void @select_v32i16(<32 x i16>* %a, <32 x i16>* %b, i1 %mask) #0 { define void @select_v32i16(<32 x i16>* %a, <32 x i16>* %b, i1 %mask) #0 {
; CHECK-LABEL: select_v32i16: ; VBITS_GE_256-LABEL: select_v32i16:
; VBITS_GE_512: and w[[AND:[0-9]+]], w2, #0x1 ; VBITS_GE_256: // %bb.0:
; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]] ; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0] ; VBITS_GE_256-NEXT: and w9, w2, #0x1
; VBITS_GE_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1] ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].h ; VBITS_GE_256-NEXT: ptrue p1.h
; VBITS_GE_512-NEXT: mov [[TMP1:z[0-9]+]].h, w[[AND]] ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_512-NEXT: and [[TMP2:z[0-9]+]].h, [[TMP1]].h, #0x1 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: cmpne [[PRES:p[0-9]+]].h, [[PG2]]/z, [[TMP2]].h, #0 ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_512-NEXT: sel [[RES:z[0-9]+]].h, [[PRES]], [[OP1]].h, [[OP2]].h ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG1]], [x0] ; VBITS_GE_256-NEXT: mov z4.h, w9
; VBITS_GE_512-NEXT: ret ; VBITS_GE_256-NEXT: and z4.h, z4.h, #0x1
; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z4.h, #0
; VBITS_GE_256-NEXT: sel z1.h, p1, z1.h, z3.h
; VBITS_GE_256-NEXT: sel z0.h, p1, z0.h, z2.h
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: select_v32i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: and w8, w2, #0x1
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT: ptrue p1.h
; VBITS_GE_512-NEXT: mov z2.h, w8
; VBITS_GE_512-NEXT: and z2.h, z2.h, #0x1
; VBITS_GE_512-NEXT: cmpne p1.h, p1/z, z2.h, #0
; VBITS_GE_512-NEXT: sel z0.h, p1, z0.h, z1.h
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load volatile <32 x i16>, <32 x i16>* %a %op1 = load volatile <32 x i16>, <32 x i16>* %a
%op2 = load volatile <32 x i16>, <32 x i16>* %b %op2 = load volatile <32 x i16>, <32 x i16>* %b
%sel = select i1 %mask, <32 x i16> %op1, <32 x i16> %op2 %sel = select i1 %mask, <32 x i16> %op1, <32 x i16> %op2
@@ -188,19 +221,20 @@ define void @select_v32i16(<32 x i16>* %a, <32 x i16>* %b, i1 %mask) #0 {
ret void ret void
} }
define void @select_v64i16(<64 x i16>* %a, <64 x i16>* %b, i1 %mask) #0 { define void @select_v64i16(<64 x i16>* %a, <64 x i16>* %b, i1 %mask) vscale_range(8,0) #0 {
; CHECK-LABEL: select_v64i16: ; CHECK-LABEL: select_v64i16:
; VBITS_GE_1024: and w[[AND:[0-9]+]], w2, #0x1 ; CHECK: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]] ; CHECK-NEXT: and w8, w2, #0x1
; VBITS_GE_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0] ; CHECK-NEXT: ptrue p0.h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1] ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].h ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_1024-NEXT: mov [[TMP1:z[0-9]+]].h, w[[AND]] ; CHECK-NEXT: ptrue p1.h
; VBITS_GE_1024-NEXT: and [[TMP2:z[0-9]+]].h, [[TMP1]].h, #0x1 ; CHECK-NEXT: mov z2.h, w8
; VBITS_GE_1024-NEXT: cmpne [[PRES:p[0-9]+]].h, [[PG2]]/z, [[TMP2]].h, #0 ; CHECK-NEXT: and z2.h, z2.h, #0x1
; VBITS_GE_1024-NEXT: sel [[RES:z[0-9]+]].h, [[PRES]], [[OP1]].h, [[OP2]].h ; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0
; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG1]], [x0] ; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
; VBITS_GE_1024-NEXT: ret ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load volatile <64 x i16>, <64 x i16>* %a %op1 = load volatile <64 x i16>, <64 x i16>* %a
%op2 = load volatile <64 x i16>, <64 x i16>* %b %op2 = load volatile <64 x i16>, <64 x i16>* %b
%sel = select i1 %mask, <64 x i16> %op1, <64 x i16> %op2 %sel = select i1 %mask, <64 x i16> %op1, <64 x i16> %op2
@@ -208,19 +242,20 @@ define void @select_v64i16(<64 x i16>* %a, <64 x i16>* %b, i1 %mask) #0 {
ret void ret void
} }
define void @select_v128i16(<128 x i16>* %a, <128 x i16>* %b, i1 %mask) #0 { define void @select_v128i16(<128 x i16>* %a, <128 x i16>* %b, i1 %mask) vscale_range(16,0) #0 {
; CHECK-LABEL: select_v128i16: ; CHECK-LABEL: select_v128i16:
; VBITS_GE_2048: and w[[AND:[0-9]+]], w2, #0x1 ; CHECK: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]] ; CHECK-NEXT: and w8, w2, #0x1
; VBITS_GE_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0] ; CHECK-NEXT: ptrue p0.h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1] ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].h ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_2048-NEXT: mov [[TMP1:z[0-9]+]].h, w[[AND]] ; CHECK-NEXT: ptrue p1.h
; VBITS_GE_2048-NEXT: and [[TMP2:z[0-9]+]].h, [[TMP1]].h, #0x1 ; CHECK-NEXT: mov z2.h, w8
; VBITS_GE_2048-NEXT: cmpne [[PRES:p[0-9]+]].h, [[PG2]]/z, [[TMP2]].h, #0 ; CHECK-NEXT: and z2.h, z2.h, #0x1
; VBITS_GE_2048-NEXT: sel [[RES:z[0-9]+]].h, [[PRES]], [[OP1]].h, [[OP2]].h ; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0
; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG1]], [x0] ; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
; VBITS_GE_2048-NEXT: ret ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load volatile <128 x i16>, <128 x i16>* %a %op1 = load volatile <128 x i16>, <128 x i16>* %a
%op2 = load volatile <128 x i16>, <128 x i16>* %b %op2 = load volatile <128 x i16>, <128 x i16>* %b
%sel = select i1 %mask, <128 x i16> %op1, <128 x i16> %op2 %sel = select i1 %mask, <128 x i16> %op1, <128 x i16> %op2
@@ -229,42 +264,45 @@ define void @select_v128i16(<128 x i16>* %a, <128 x i16>* %b, i1 %mask) #0 {
} }
; Don't use SVE for 64-bit vectors. ; Don't use SVE for 64-bit vectors.
define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, i1 %mask) #0 { define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v2i32: ; CHECK-LABEL: select_v2i32:
; CHECK: tst w0, #0x1 ; CHECK: // %bb.0:
; CHECK-NEXT: csetm w8, ne ; CHECK-NEXT: tst w0, #0x1
; CHECK-NEXT: dup v2.2s, w8 ; CHECK-NEXT: csetm w8, ne
; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: dup v2.2s, w8
; CHECK-NEXT: ret ; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
; CHECK-NEXT: ret
%sel = select i1 %mask, <2 x i32> %op1, <2 x i32> %op2 %sel = select i1 %mask, <2 x i32> %op1, <2 x i32> %op2
ret <2 x i32> %sel ret <2 x i32> %sel
} }
; Don't use SVE for 128-bit vectors. ; Don't use SVE for 128-bit vectors.
define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) #0 { define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4i32: ; CHECK-LABEL: select_v4i32:
; CHECK: tst w0, #0x1 ; CHECK: // %bb.0:
; CHECK-NEXT: csetm w8, ne ; CHECK-NEXT: tst w0, #0x1
; CHECK-NEXT: dup v2.4s, w8 ; CHECK-NEXT: csetm w8, ne
; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: dup v2.4s, w8
; CHECK-NEXT: ret ; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT: ret
%sel = select i1 %mask, <4 x i32> %op1, <4 x i32> %op2 %sel = select i1 %mask, <4 x i32> %op1, <4 x i32> %op2
ret <4 x i32> %sel ret <4 x i32> %sel
} }
define void @select_v8i32(<8 x i32>* %a, <8 x i32>* %b, i1 %mask) #0 { define void @select_v8i32(<8 x i32>* %a, <8 x i32>* %b, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v8i32: ; CHECK-LABEL: select_v8i32:
; CHECK: and w[[AND:[0-9]+]], w2, #0x1 ; CHECK: // %bb.0:
; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]] ; CHECK-NEXT: and w8, w2, #0x1
; CHECK-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0] ; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1] ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].s ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: mov [[TMP1:z[0-9]+]].s, w[[AND]] ; CHECK-NEXT: ptrue p1.s
; CHECK-NEXT: and [[TMP2:z[0-9]+]].s, [[TMP1]].s, #0x1 ; CHECK-NEXT: mov z2.s, w8
; CHECK-NEXT: cmpne [[PRES:p[0-9]+]].s, [[PG2]]/z, [[TMP2]].s, #0 ; CHECK-NEXT: and z2.s, z2.s, #0x1
; CHECK-NEXT: sel [[RES:z[0-9]+]].s, [[PRES]], [[OP1]].s, [[OP2]].s ; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0
; CHECK-NEXT: st1w { [[RES]].s }, [[PG1]], [x0] ; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
; CHECK-NEXT: ret ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load volatile <8 x i32>, <8 x i32>* %a %op1 = load volatile <8 x i32>, <8 x i32>* %a
%op2 = load volatile <8 x i32>, <8 x i32>* %b %op2 = load volatile <8 x i32>, <8 x i32>* %b
%sel = select i1 %mask, <8 x i32> %op1, <8 x i32> %op2 %sel = select i1 %mask, <8 x i32> %op1, <8 x i32> %op2
@@ -273,18 +311,38 @@ define void @select_v8i32(<8 x i32>* %a, <8 x i32>* %b, i1 %mask) #0 {
} }
define void @select_v16i32(<16 x i32>* %a, <16 x i32>* %b, i1 %mask) #0 { define void @select_v16i32(<16 x i32>* %a, <16 x i32>* %b, i1 %mask) #0 {
; CHECK-LABEL: select_v16i32: ; VBITS_GE_256-LABEL: select_v16i32:
; VBITS_GE_512: and w[[AND:[0-9]+]], w2, #0x1 ; VBITS_GE_256: // %bb.0:
; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]] ; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_512-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0] ; VBITS_GE_256-NEXT: and w9, w2, #0x1
; VBITS_GE_512-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1] ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s ; VBITS_GE_256-NEXT: ptrue p1.s
; VBITS_GE_512-NEXT: mov [[TMP1:z[0-9]+]].s, w[[AND]] ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_512-NEXT: and [[TMP2:z[0-9]+]].s, [[TMP1]].s, #0x1 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: cmpne [[PRES:p[0-9]+]].s, [[PG2]]/z, [[TMP2]].s, #0 ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_512-NEXT: sel [[RES:z[0-9]+]].s, [[PRES]], [[OP1]].s, [[OP2]].s ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG1]], [x0] ; VBITS_GE_256-NEXT: mov z4.s, w9
; VBITS_GE_512-NEXT: ret ; VBITS_GE_256-NEXT: and z4.s, z4.s, #0x1
; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z4.s, #0
; VBITS_GE_256-NEXT: sel z1.s, p1, z1.s, z3.s
; VBITS_GE_256-NEXT: sel z0.s, p1, z0.s, z2.s
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: select_v16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: and w8, w2, #0x1
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: ptrue p1.s
; VBITS_GE_512-NEXT: mov z2.s, w8
; VBITS_GE_512-NEXT: and z2.s, z2.s, #0x1
; VBITS_GE_512-NEXT: cmpne p1.s, p1/z, z2.s, #0
; VBITS_GE_512-NEXT: sel z0.s, p1, z0.s, z1.s
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load volatile <16 x i32>, <16 x i32>* %a %op1 = load volatile <16 x i32>, <16 x i32>* %a
%op2 = load volatile <16 x i32>, <16 x i32>* %b %op2 = load volatile <16 x i32>, <16 x i32>* %b
%sel = select i1 %mask, <16 x i32> %op1, <16 x i32> %op2 %sel = select i1 %mask, <16 x i32> %op1, <16 x i32> %op2
@@ -292,19 +350,20 @@ define void @select_v16i32(<16 x i32>* %a, <16 x i32>* %b, i1 %mask) #0 {
ret void ret void
} }
define void @select_v32i32(<32 x i32>* %a, <32 x i32>* %b, i1 %mask) #0 { define void @select_v32i32(<32 x i32>* %a, <32 x i32>* %b, i1 %mask) vscale_range(8,0) #0 {
; CHECK-LABEL: select_v32i32: ; CHECK-LABEL: select_v32i32:
; VBITS_GE_1024: and w[[AND:[0-9]+]], w2, #0x1 ; CHECK: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]] ; CHECK-NEXT: and w8, w2, #0x1
; VBITS_GE_1024-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0] ; CHECK-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1] ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_1024-NEXT: mov [[TMP1:z[0-9]+]].s, w[[AND]] ; CHECK-NEXT: ptrue p1.s
; VBITS_GE_1024-NEXT: and [[TMP2:z[0-9]+]].s, [[TMP1]].s, #0x1 ; CHECK-NEXT: mov z2.s, w8
; VBITS_GE_1024-NEXT: cmpne [[PRES:p[0-9]+]].s, [[PG2]]/z, [[TMP2]].s, #0 ; CHECK-NEXT: and z2.s, z2.s, #0x1
; VBITS_GE_1024-NEXT: sel [[RES:z[0-9]+]].s, [[PRES]], [[OP1]].s, [[OP2]].s ; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0
; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG1]], [x0] ; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
; VBITS_GE_1024-NEXT: ret ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load volatile <32 x i32>, <32 x i32>* %a %op1 = load volatile <32 x i32>, <32 x i32>* %a
%op2 = load volatile <32 x i32>, <32 x i32>* %b %op2 = load volatile <32 x i32>, <32 x i32>* %b
%sel = select i1 %mask, <32 x i32> %op1, <32 x i32> %op2 %sel = select i1 %mask, <32 x i32> %op1, <32 x i32> %op2
@@ -312,19 +371,20 @@ define void @select_v32i32(<32 x i32>* %a, <32 x i32>* %b, i1 %mask) #0 {
ret void ret void
} }
define void @select_v64i32(<64 x i32>* %a, <64 x i32>* %b, i1 %mask) #0 { define void @select_v64i32(<64 x i32>* %a, <64 x i32>* %b, i1 %mask) vscale_range(16,0) #0 {
; CHECK-LABEL: select_v64i32: ; CHECK-LABEL: select_v64i32:
; VBITS_GE_2048: and w[[AND:[0-9]+]], w2, #0x1 ; CHECK: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]] ; CHECK-NEXT: and w8, w2, #0x1
; VBITS_GE_2048-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0] ; CHECK-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1] ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_2048-NEXT: mov [[TMP1:z[0-9]+]].s, w[[AND]] ; CHECK-NEXT: ptrue p1.s
; VBITS_GE_2048-NEXT: and [[TMP2:z[0-9]+]].s, [[TMP1]].s, #0x1 ; CHECK-NEXT: mov z2.s, w8
; VBITS_GE_2048-NEXT: cmpne [[PRES:p[0-9]+]].s, [[PG2]]/z, [[TMP2]].s, #0 ; CHECK-NEXT: and z2.s, z2.s, #0x1
; VBITS_GE_2048-NEXT: sel [[RES:z[0-9]+]].s, [[PRES]], [[OP1]].s, [[OP2]].s ; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0
; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG1]], [x0] ; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
; VBITS_GE_2048-NEXT: ret ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load volatile <64 x i32>, <64 x i32>* %a %op1 = load volatile <64 x i32>, <64 x i32>* %a
%op2 = load volatile <64 x i32>, <64 x i32>* %b %op2 = load volatile <64 x i32>, <64 x i32>* %b
%sel = select i1 %mask, <64 x i32> %op1, <64 x i32> %op2 %sel = select i1 %mask, <64 x i32> %op1, <64 x i32> %op2
@@ -333,42 +393,45 @@ define void @select_v64i32(<64 x i32>* %a, <64 x i32>* %b, i1 %mask) #0 {
} }
; Don't use SVE for 64-bit vectors. ; Don't use SVE for 64-bit vectors.
define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, i1 %mask) #0 { define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v1i64: ; CHECK-LABEL: select_v1i64:
; CHECK: tst w0, #0x1 ; CHECK: // %bb.0:
; CHECK-NEXT: csetm x8, ne ; CHECK-NEXT: tst w0, #0x1
; CHECK-NEXT: fmov d2, x8 ; CHECK-NEXT: csetm x8, ne
; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: fmov d2, x8
; CHECK-NEXT: ret ; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
; CHECK-NEXT: ret
%sel = select i1 %mask, <1 x i64> %op1, <1 x i64> %op2 %sel = select i1 %mask, <1 x i64> %op1, <1 x i64> %op2
ret <1 x i64> %sel ret <1 x i64> %sel
} }
; Don't use SVE for 128-bit vectors. ; Don't use SVE for 128-bit vectors.
define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) #0 { define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v2i64: ; CHECK-LABEL: select_v2i64:
; CHECK: tst w0, #0x1 ; CHECK: // %bb.0:
; CHECK-NEXT: csetm x8, ne ; CHECK-NEXT: tst w0, #0x1
; CHECK-NEXT: dup v2.2d, x8 ; CHECK-NEXT: csetm x8, ne
; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: dup v2.2d, x8
; CHECK-NEXT: ret ; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT: ret
%sel = select i1 %mask, <2 x i64> %op1, <2 x i64> %op2 %sel = select i1 %mask, <2 x i64> %op1, <2 x i64> %op2
ret <2 x i64> %sel ret <2 x i64> %sel
} }
define void @select_v4i64(<4 x i64>* %a, <4 x i64>* %b, i1 %mask) #0 { define void @select_v4i64(<4 x i64>* %a, <4 x i64>* %b, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4i64: ; CHECK-LABEL: select_v4i64:
; CHECK: and w[[AND:[0-9]+]], w2, #0x1 ; CHECK: // %bb.0:
; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]] ; CHECK-NEXT: and w8, w2, #0x1
; CHECK-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG1]]/z, [x0] ; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG1]]/z, [x1] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].d ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: mov [[TMP1:z[0-9]+]].d, x[[AND]] ; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: and [[TMP2:z[0-9]+]].d, [[TMP1]].d, #0x1 ; CHECK-NEXT: mov z2.d, x8
; CHECK-NEXT: cmpne [[PRES:p[0-9]+]].d, [[PG2]]/z, [[TMP2]].d, #0 ; CHECK-NEXT: and z2.d, z2.d, #0x1
; CHECK-NEXT: sel [[RES:z[0-9]+]].d, [[PRES]], [[OP1]].d, [[OP2]].d ; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
; CHECK-NEXT: st1d { [[RES]].d }, [[PG1]], [x0] ; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
; CHECK-NEXT: ret ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load volatile <4 x i64>, <4 x i64>* %a %op1 = load volatile <4 x i64>, <4 x i64>* %a
%op2 = load volatile <4 x i64>, <4 x i64>* %b %op2 = load volatile <4 x i64>, <4 x i64>* %b
%sel = select i1 %mask, <4 x i64> %op1, <4 x i64> %op2 %sel = select i1 %mask, <4 x i64> %op1, <4 x i64> %op2
@@ -377,18 +440,38 @@ define void @select_v4i64(<4 x i64>* %a, <4 x i64>* %b, i1 %mask) #0 {
} }
define void @select_v8i64(<8 x i64>* %a, <8 x i64>* %b, i1 %mask) #0 { define void @select_v8i64(<8 x i64>* %a, <8 x i64>* %b, i1 %mask) #0 {
; CHECK-LABEL: select_v8i64: ; VBITS_GE_256-LABEL: select_v8i64:
; VBITS_GE_512: and w[[AND:[0-9]+]], w2, #0x1 ; VBITS_GE_256: // %bb.0:
; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]] ; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_512-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG1]]/z, [x0] ; VBITS_GE_256-NEXT: and w9, w2, #0x1
; VBITS_GE_512-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG1]]/z, [x1] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d ; VBITS_GE_256-NEXT: ptrue p1.d
; VBITS_GE_512-NEXT: mov [[TMP1:z[0-9]+]].d, x[[AND]] ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_512-NEXT: and [[TMP2:z[0-9]+]].d, [[TMP1]].d, #0x1 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: cmpne [[PRES:p[0-9]+]].d, [[PG2]]/z, [[TMP2]].d, #0 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_512-NEXT: sel [[RES:z[0-9]+]].d, [[PRES]], [[OP1]].d, [[OP2]].d ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG1]], [x0] ; VBITS_GE_256-NEXT: mov z4.d, x9
; VBITS_GE_512-NEXT: ret ; VBITS_GE_256-NEXT: and z4.d, z4.d, #0x1
; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z4.d, #0
; VBITS_GE_256-NEXT: sel z1.d, p1, z1.d, z3.d
; VBITS_GE_256-NEXT: sel z0.d, p1, z0.d, z2.d
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: select_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: and w8, w2, #0x1
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: ptrue p1.d
; VBITS_GE_512-NEXT: mov z2.d, x8
; VBITS_GE_512-NEXT: and z2.d, z2.d, #0x1
; VBITS_GE_512-NEXT: cmpne p1.d, p1/z, z2.d, #0
; VBITS_GE_512-NEXT: sel z0.d, p1, z0.d, z1.d
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load volatile <8 x i64>, <8 x i64>* %a %op1 = load volatile <8 x i64>, <8 x i64>* %a
%op2 = load volatile <8 x i64>, <8 x i64>* %b %op2 = load volatile <8 x i64>, <8 x i64>* %b
%sel = select i1 %mask, <8 x i64> %op1, <8 x i64> %op2 %sel = select i1 %mask, <8 x i64> %op1, <8 x i64> %op2
@@ -396,19 +479,20 @@ define void @select_v8i64(<8 x i64>* %a, <8 x i64>* %b, i1 %mask) #0 {
ret void ret void
} }
define void @select_v16i64(<16 x i64>* %a, <16 x i64>* %b, i1 %mask) #0 { define void @select_v16i64(<16 x i64>* %a, <16 x i64>* %b, i1 %mask) vscale_range(8,0) #0 {
; CHECK-LABEL: select_v16i64: ; CHECK-LABEL: select_v16i64:
; VBITS_GE_1024: and w[[AND:[0-9]+]], w2, #0x1 ; CHECK: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]] ; CHECK-NEXT: and w8, w2, #0x1
; VBITS_GE_1024-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG1]]/z, [x0] ; CHECK-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG1]]/z, [x1] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_1024-NEXT: mov [[TMP1:z[0-9]+]].d, x[[AND]] ; CHECK-NEXT: ptrue p1.d
; VBITS_GE_1024-NEXT: and [[TMP2:z[0-9]+]].d, [[TMP1]].d, #0x1 ; CHECK-NEXT: mov z2.d, x8
; VBITS_GE_1024-NEXT: cmpne [[PRES:p[0-9]+]].d, [[PG2]]/z, [[TMP2]].d, #0 ; CHECK-NEXT: and z2.d, z2.d, #0x1
; VBITS_GE_1024-NEXT: sel [[RES:z[0-9]+]].d, [[PRES]], [[OP1]].d, [[OP2]].d ; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG1]], [x0] ; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
; VBITS_GE_1024-NEXT: ret ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load volatile <16 x i64>, <16 x i64>* %a %op1 = load volatile <16 x i64>, <16 x i64>* %a
%op2 = load volatile <16 x i64>, <16 x i64>* %b %op2 = load volatile <16 x i64>, <16 x i64>* %b
%sel = select i1 %mask, <16 x i64> %op1, <16 x i64> %op2 %sel = select i1 %mask, <16 x i64> %op1, <16 x i64> %op2
@@ -416,19 +500,20 @@ define void @select_v16i64(<16 x i64>* %a, <16 x i64>* %b, i1 %mask) #0 {
ret void ret void
} }
define void @select_v32i64(<32 x i64>* %a, <32 x i64>* %b, i1 %mask) #0 { define void @select_v32i64(<32 x i64>* %a, <32 x i64>* %b, i1 %mask) vscale_range(16,0) #0 {
; CHECK-LABEL: select_v32i64: ; CHECK-LABEL: select_v32i64:
; VBITS_GE_2048: and w[[AND:[0-9]+]], w2, #0x1 ; CHECK: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]] ; CHECK-NEXT: and w8, w2, #0x1
; VBITS_GE_2048-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG1]]/z, [x0] ; CHECK-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG1]]/z, [x1] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_2048-NEXT: mov [[TMP1:z[0-9]+]].d, x[[AND]] ; CHECK-NEXT: ptrue p1.d
; VBITS_GE_2048-NEXT: and [[TMP2:z[0-9]+]].d, [[TMP1]].d, #0x1 ; CHECK-NEXT: mov z2.d, x8
; VBITS_GE_2048-NEXT: cmpne [[PRES:p[0-9]+]].d, [[PG2]]/z, [[TMP2]].d, #0 ; CHECK-NEXT: and z2.d, z2.d, #0x1
; VBITS_GE_2048-NEXT: sel [[RES:z[0-9]+]].d, [[PRES]], [[OP1]].d, [[OP2]].d ; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG1]], [x0] ; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
; VBITS_GE_2048-NEXT: ret ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load volatile <32 x i64>, <32 x i64>* %a %op1 = load volatile <32 x i64>, <32 x i64>* %a
%op2 = load volatile <32 x i64>, <32 x i64>* %b %op2 = load volatile <32 x i64>, <32 x i64>* %b
%sel = select i1 %mask, <32 x i64> %op1, <32 x i64> %op2 %sel = select i1 %mask, <32 x i64> %op1, <32 x i64> %op2

@@ -1,35 +1,29 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
-; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK
-; VBYTES represents the useful byte size of a vector register from the code
-; generator's point of view. It is clamped to power-of-2 values because
-; only power-of-2 vector lengths are considered legal, regardless of the
-; user specified vector length.
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_2048

target triple = "aarch64-unknown-linux-gnu"
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
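
; Worked example of the deleted VBYTES scheme: at -aarch64-sve-vector-bits-min=512
; the old RUN line defined VBYTES=64, so a pattern such as
;   vl[[#min(div(VBYTES,4),8)]]
; evaluated to vl[[#min(16,8)]], i.e. it matched "vl8": the operation width
; clamped to what one register holds.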
; Don't use SVE for 64-bit vectors. ; Don't use SVE for 64-bit vectors.
define <2 x float> @load_v2f32(<2 x float>* %a) #0 { define <2 x float> @load_v2f32(<2 x float>* %a) #0 {
; CHECK-LABEL: load_v2f32: ; CHECK-LABEL: load_v2f32:
; CHECK: ldr d0, [x0] ; CHECK: // %bb.0:
; CHECK: ret ; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ret
%load = load <2 x float>, <2 x float>* %a %load = load <2 x float>, <2 x float>* %a
ret <2 x float> %load ret <2 x float> %load
} }
@@ -37,66 +31,164 @@ define <2 x float> @load_v2f32(<2 x float>* %a) #0 {
; Don't use SVE for 128-bit vectors. ; Don't use SVE for 128-bit vectors.
define <4 x float> @load_v4f32(<4 x float>* %a) #0 { define <4 x float> @load_v4f32(<4 x float>* %a) #0 {
; CHECK-LABEL: load_v4f32: ; CHECK-LABEL: load_v4f32:
; CHECK: ldr q0, [x0] ; CHECK: // %bb.0:
; CHECK: ret ; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ret
%load = load <4 x float>, <4 x float>* %a %load = load <4 x float>, <4 x float>* %a
ret <4 x float> %load ret <4 x float> %load
} }
define <8 x float> @load_v8f32(<8 x float>* %a) #0 { define <8 x float> @load_v8f32(<8 x float>* %a) #0 {
; CHECK-LABEL: load_v8f32: ; CHECK-LABEL: load_v8f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]] ; CHECK: // %bb.0:
; CHECK: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0] ; CHECK-NEXT: ptrue p0.s, vl8
; CHECK: ret ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: st1w { z0.s }, p0, [x8]
; CHECK-NEXT: ret
%load = load <8 x float>, <8 x float>* %a %load = load <8 x float>, <8 x float>* %a
ret <8 x float> %load ret <8 x float> %load
} }
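
; The st1w to [x8] above is the indirect return: once the <8 x float> result
; lives in an SVE register it is returned via memory, with the caller passing
; the result address in x8 (the AAPCS64 indirect result register).
; Hypothetical caller, not part of the test:
declare <8 x float> @load_v8f32(<8 x float>*)
define void @copy_v8f32(<8 x float>* %src, <8 x float>* %dst) "target-features"="+sve" {
  %v = call <8 x float> @load_v8f32(<8 x float>* %src) ; x8 = address of %v's stack slot
  store <8 x float> %v, <8 x float>* %dst
  ret void
}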
define <16 x float> @load_v16f32(<16 x float>* %a) #0 { define <16 x float> @load_v16f32(<16 x float>* %a) #0 {
; CHECK-LABEL: load_v16f32: ; VBITS_GE_256-LABEL: load_v16f32:
; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]] ; VBITS_GE_256: // %bb.0:
; CHECK-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0] ; VBITS_GE_256-NEXT: mov x9, #8
; VBITS_LE_256-DAG: mov x[[A1:[0-9]+]], #[[#div(VBYTES,4)]] ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A1]], lsl #2] ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
; CHECK: ret ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: load_v16f32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_512-NEXT: ret
;
; VBITS_GE_1024-LABEL: load_v16f32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: load_v16f32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl16
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_2048-NEXT: ret
%load = load <16 x float>, <16 x float>* %a %load = load <16 x float>, <16 x float>* %a
ret <16 x float> %load ret <16 x float> %load
} }
define <32 x float> @load_v32f32(<32 x float>* %a) #0 { define <32 x float> @load_v32f32(<32 x float>* %a) #0 {
; CHECK-LABEL: load_v32f32: ; VBITS_GE_256-LABEL: load_v32f32:
; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]] ; VBITS_GE_256: // %bb.0:
; CHECK-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0] ; VBITS_GE_256-NEXT: mov x9, #16
; VBITS_LE_512-DAG: mov x[[A1:[0-9]+]], #[[#div(VBYTES,4)]] ; VBITS_GE_256-NEXT: mov x10, #24
; VBITS_LE_512-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A1]], lsl #2] ; VBITS_GE_256-NEXT: mov x11, #8
; VBITS_LE_256-DAG: mov x[[A2:[0-9]+]], #[[#mul(div(VBYTES,4),2)]] ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A2]], lsl #2] ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
; VBITS_LE_256-DAG: mov x[[A3:[0-9]+]], #[[#mul(div(VBYTES,4),3)]] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2]
; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A3]], lsl #2] ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2]
; CHECK: ret ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x8, x11, lsl #2]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: load_v32f32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: mov x9, #16
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_512-NEXT: st1w { z1.s }, p0, [x8]
; VBITS_GE_512-NEXT: ret
;
; VBITS_GE_1024-LABEL: load_v32f32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: load_v32f32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_2048-NEXT: ret
%load = load <32 x float>, <32 x float>* %a %load = load <32 x float>, <32 x float>* %a
ret <32 x float> %load ret <32 x float> %load
} }
define <64 x float> @load_v64f32(<64 x float>* %a) #0 { define <64 x float> @load_v64f32(<64 x float>* %a) #0 {
; CHECK-LABEL: load_v64f32: ; VBITS_GE_256-LABEL: load_v64f32:
; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]] ; VBITS_GE_256: // %bb.0:
; CHECK-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0] ; VBITS_GE_256-NEXT: mov x9, #8
; VBITS_LE_1024-DAG: mov x[[A1:[0-9]+]], #[[#div(VBYTES,4)]] ; VBITS_GE_256-NEXT: mov x10, #48
; VBITS_LE_1024-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A1]], lsl #2] ; VBITS_GE_256-NEXT: mov x11, #56
; VBITS_LE_512-DAG: mov x[[A2:[0-9]+]], #[[#mul(div(VBYTES,4),2)]] ; VBITS_GE_256-NEXT: mov x12, #32
; VBITS_LE_512-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A2]], lsl #2] ; VBITS_GE_256-NEXT: mov x13, #40
; VBITS_LE_512-DAG: mov x[[A3:[0-9]+]], #[[#mul(div(VBYTES,4),3)]] ; VBITS_GE_256-NEXT: mov x14, #16
; VBITS_LE_512-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A3]], lsl #2] ; VBITS_GE_256-NEXT: mov x15, #24
; VBITS_LE_256-DAG: mov x[[A4:[0-9]+]], #[[#mul(div(VBYTES,4),4)]] ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A4]], lsl #2] ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x10, lsl #2]
; VBITS_LE_256-DAG: mov x[[A5:[0-9]+]], #[[#mul(div(VBYTES,4),5)]] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x11, lsl #2]
; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A5]], lsl #2] ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x12, lsl #2]
; VBITS_LE_256-DAG: mov x[[A6:[0-9]+]], #[[#mul(div(VBYTES,4),6)]] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x13, lsl #2]
; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A6]], lsl #2] ; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x14, lsl #2]
; VBITS_LE_256-DAG: mov x[[A7:[0-9]+]], #[[#mul(div(VBYTES,4),7)]] ; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x15, lsl #2]
; VBITS_LE_256-DAG: ld1w { z{{[0-9]+}}.s }, [[PG]]/z, [x0, x[[A7]], lsl #2] ; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x9, lsl #2]
; CHECK: ret ; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x11, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x8, x13, lsl #2]
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x8, x12, lsl #2]
; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x8, x15, lsl #2]
; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x8, x14, lsl #2]
; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: load_v64f32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: mov x9, #32
; VBITS_GE_512-NEXT: mov x10, #48
; VBITS_GE_512-NEXT: mov x11, #16
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_512-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2]
; VBITS_GE_512-NEXT: ld1w { z3.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: st1w { z1.s }, p0, [x8, x10, lsl #2]
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_512-NEXT: st1w { z2.s }, p0, [x8, x11, lsl #2]
; VBITS_GE_512-NEXT: st1w { z3.s }, p0, [x8]
; VBITS_GE_512-NEXT: ret
;
; VBITS_GE_1024-LABEL: load_v64f32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: mov x9, #32
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_1024-NEXT: st1w { z1.s }, p0, [x8]
; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: load_v64f32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_2048-NEXT: ret
%load = load <64 x float>, <64 x float>* %a %load = load <64 x float>, <64 x float>* %a
ret <64 x float> %load ret <64 x float> %load
} }
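
; Offset arithmetic in the VBITS_GE_256 checks above: each vl8 chunk covers 8
; floats, so the eight chunks of a <64 x float> start at elements 0, 8, 16,
; ..., 56 (materialised in x9-x15), and the "lsl #2" in the addressing mode
; scales the element index to bytes, e.g. x11 = #56 becomes byte offset 224.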

@@ -1,18 +1,7 @@
-; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_2048,VBITS_GE_1024,VBITS_GE_512
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"
@@ -20,7 +9,7 @@ target triple = "aarch64-unknown-linux-gnu"
; LD1B ; LD1B
; ;
define void @masked_gather_v2i8(<2 x i8>* %a, <2 x i8*>* %b) #0 { define void @masked_gather_v2i8(<2 x i8>* %a, <2 x i8*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v2i8: ; CHECK-LABEL: masked_gather_v2i8:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x1] ; CHECK-NEXT: ldr q0, [x1]
@@ -36,7 +25,7 @@ define void @masked_gather_v2i8(<2 x i8>* %a, <2 x i8*>* %b) #0 {
ret void ret void
} }
define void @masked_gather_v4i8(<4 x i8>* %a, <4 x i8*>* %b) #0 { define void @masked_gather_v4i8(<4 x i8>* %a, <4 x i8*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v4i8: ; CHECK-LABEL: masked_gather_v4i8:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ptrue p0.d, vl4
@@ -54,21 +43,21 @@ define void @masked_gather_v4i8(<4 x i8>* %a, <4 x i8*>* %b) #0 {
} }
define void @masked_gather_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 { define void @masked_gather_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 {
; VBITS_EQ_256-LABEL: masked_gather_v8i8: ; VBITS_GE_256-LABEL: masked_gather_v8i8:
; VBITS_EQ_256: // %bb.0: ; VBITS_GE_256: // %bb.0:
; VBITS_EQ_256-NEXT: mov x8, #4 ; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_EQ_256-NEXT: ld1b { z0.d }, p0/z, [z0.d] ; VBITS_GE_256-NEXT: ld1b { z0.d }, p0/z, [z0.d]
; VBITS_EQ_256-NEXT: ld1b { z1.d }, p0/z, [z1.d] ; VBITS_GE_256-NEXT: ld1b { z1.d }, p0/z, [z1.d]
; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
; VBITS_EQ_256-NEXT: uzp1 v0.8b, v1.8b, v0.8b ; VBITS_GE_256-NEXT: uzp1 v0.8b, v1.8b, v0.8b
; VBITS_EQ_256-NEXT: str d0, [x0] ; VBITS_GE_256-NEXT: str d0, [x0]
; VBITS_EQ_256-NEXT: ret ; VBITS_GE_256-NEXT: ret
; ;
; VBITS_GE_512-LABEL: masked_gather_v8i8: ; VBITS_GE_512-LABEL: masked_gather_v8i8:
; VBITS_GE_512: // %bb.0: ; VBITS_GE_512: // %bb.0:
@@ -86,17 +75,17 @@ define void @masked_gather_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 {
ret void ret void
} }
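
; The gathers in this file index narrow elements through 64-bit pointers, so
; each element is loaded into a .d lane and then narrowed back with uzp1 steps
; before the contiguous store. For reference, the intrinsic behind these
; checks, with the signature the later calls in this file use:
declare <8 x i8> @llvm.masked.gather.v8i8(<8 x i8*>, i32, <8 x i1>, <8 x i8>)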
define void @masked_gather_v16i8(<16 x i8>* %a, <16 x i8*>* %b) #0 { define void @masked_gather_v16i8(<16 x i8>* %a, <16 x i8*>* %b) vscale_range(8,0) #0 {
; VBITS_GE_1024-LABEL: masked_gather_v16i8: ; CHECK-LABEL: masked_gather_v16i8:
; VBITS_GE_1024: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x1] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_1024-NEXT: ld1b { z0.d }, p0/z, [z0.d] ; CHECK-NEXT: ld1b { z0.d }, p0/z, [z0.d]
; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_1024-NEXT: uzp1 z0.b, z0.b, z0.b ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
; VBITS_GE_1024-NEXT: str q0, [x0] ; CHECK-NEXT: str q0, [x0]
; VBITS_GE_1024-NEXT: ret ; CHECK-NEXT: ret
%ptrs = load <16 x i8*>, <16 x i8*>* %b %ptrs = load <16 x i8*>, <16 x i8*>* %b
%vals = call <16 x i8> @llvm.masked.gather.v16i8(<16 x i8*> %ptrs, i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, %vals = call <16 x i8> @llvm.masked.gather.v16i8(<16 x i8*> %ptrs, i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef) i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
@@ -104,18 +93,18 @@ define void @masked_gather_v16i8(<16 x i8>* %a, <16 x i8*>* %b) #0 {
ret void ret void
} }
define void @masked_gather_v32i8(<32 x i8>* %a, <32 x i8*>* %b) #0 { define void @masked_gather_v32i8(<32 x i8>* %a, <32 x i8*>* %b) vscale_range(16,0) #0 {
; VBITS_GE_2048-LABEL: masked_gather_v32i8: ; CHECK-LABEL: masked_gather_v32i8:
; VBITS_GE_2048: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x1] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_2048-NEXT: ld1b { z0.d }, p0/z, [z0.d] ; CHECK-NEXT: ld1b { z0.d }, p0/z, [z0.d]
; VBITS_GE_2048-NEXT: ptrue p0.b, vl32 ; CHECK-NEXT: ptrue p0.b, vl32
; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_2048-NEXT: uzp1 z0.b, z0.b, z0.b ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
; VBITS_GE_2048-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; VBITS_GE_2048-NEXT: ret ; CHECK-NEXT: ret
%ptrs = load <32 x i8*>, <32 x i8*>* %b %ptrs = load <32 x i8*>, <32 x i8*>* %b
%vals = call <32 x i8> @llvm.masked.gather.v32i8(<32 x i8*> %ptrs, i32 8, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, %vals = call <32 x i8> @llvm.masked.gather.v32i8(<32 x i8*> %ptrs, i32 8, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
@@ -129,7 +118,7 @@ define void @masked_gather_v32i8(<32 x i8>* %a, <32 x i8*>* %b) #0 {
; LD1H ; LD1H
; ;
define void @masked_gather_v2i16(<2 x i16>* %a, <2 x i16*>* %b) #0 { define void @masked_gather_v2i16(<2 x i16>* %a, <2 x i16*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v2i16: ; CHECK-LABEL: masked_gather_v2i16:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x1] ; CHECK-NEXT: ldr q0, [x1]
@@ -145,7 +134,7 @@ define void @masked_gather_v2i16(<2 x i16>* %a, <2 x i16*>* %b) #0 {
ret void ret void
} }
define void @masked_gather_v4i16(<4 x i16>* %a, <4 x i16*>* %b) #0 { define void @masked_gather_v4i16(<4 x i16>* %a, <4 x i16*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v4i16: ; CHECK-LABEL: masked_gather_v4i16:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ptrue p0.d, vl4
@@ -162,21 +151,21 @@ define void @masked_gather_v4i16(<4 x i16>* %a, <4 x i16*>* %b) #0 {
} }
define void @masked_gather_v8i16(<8 x i16>* %a, <8 x i16*>* %b) #0 { define void @masked_gather_v8i16(<8 x i16>* %a, <8 x i16*>* %b) #0 {
; VBITS_EQ_256-LABEL: masked_gather_v8i16: ; VBITS_GE_256-LABEL: masked_gather_v8i16:
; VBITS_EQ_256: // %bb.0: ; VBITS_GE_256: // %bb.0:
; VBITS_EQ_256-NEXT: mov x8, #4 ; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_EQ_256-NEXT: ld1h { z0.d }, p0/z, [z0.d] ; VBITS_GE_256-NEXT: ld1h { z0.d }, p0/z, [z0.d]
; VBITS_EQ_256-NEXT: ld1h { z1.d }, p0/z, [z1.d] ; VBITS_GE_256-NEXT: ld1h { z1.d }, p0/z, [z1.d]
; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
; VBITS_EQ_256-NEXT: mov v1.d[1], v0.d[0] ; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0]
; VBITS_EQ_256-NEXT: str q1, [x0] ; VBITS_GE_256-NEXT: str q1, [x0]
; VBITS_EQ_256-NEXT: ret ; VBITS_GE_256-NEXT: ret
; ;
; VBITS_GE_512-LABEL: masked_gather_v8i16: ; VBITS_GE_512-LABEL: masked_gather_v8i16:
; VBITS_GE_512: // %bb.0: ; VBITS_GE_512: // %bb.0:
@@ -193,17 +182,17 @@ define void @masked_gather_v8i16(<8 x i16>* %a, <8 x i16*>* %b) #0 {
ret void ret void
} }
define void @masked_gather_v16i16(<16 x i16>* %a, <16 x i16*>* %b) #0 { define void @masked_gather_v16i16(<16 x i16>* %a, <16 x i16*>* %b) vscale_range(8,0) #0 {
; VBITS_GE_1024-LABEL: masked_gather_v16i16: ; CHECK-LABEL: masked_gather_v16i16:
; VBITS_GE_1024: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x1] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_1024-NEXT: ld1h { z0.d }, p0/z, [z0.d] ; CHECK-NEXT: ld1h { z0.d }, p0/z, [z0.d]
; VBITS_GE_1024-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: ptrue p0.h, vl16
; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_1024-NEXT: ret ; CHECK-NEXT: ret
%ptrs = load <16 x i16*>, <16 x i16*>* %b %ptrs = load <16 x i16*>, <16 x i16*>* %b
%vals = call <16 x i16> @llvm.masked.gather.v16i16(<16 x i16*> %ptrs, i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, %vals = call <16 x i16> @llvm.masked.gather.v16i16(<16 x i16*> %ptrs, i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i16> undef) i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i16> undef)
@@ -211,17 +200,17 @@ define void @masked_gather_v16i16(<16 x i16>* %a, <16 x i16*>* %b) #0 {
ret void ret void
} }
define void @masked_gather_v32i16(<32 x i16>* %a, <32 x i16*>* %b) #0 { define void @masked_gather_v32i16(<32 x i16>* %a, <32 x i16*>* %b) vscale_range(16,0) #0 {
; VBITS_GE_2048-LABEL: masked_gather_v32i16: ; CHECK-LABEL: masked_gather_v32i16:
; VBITS_GE_2048: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x1] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_2048-NEXT: ld1h { z0.d }, p0/z, [z0.d] ; CHECK-NEXT: ld1h { z0.d }, p0/z, [z0.d]
; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 ; CHECK-NEXT: ptrue p0.h, vl32
; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_2048-NEXT: ret ; CHECK-NEXT: ret
%ptrs = load <32 x i16*>, <32 x i16*>* %b %ptrs = load <32 x i16*>, <32 x i16*>* %b
%vals = call <32 x i16> @llvm.masked.gather.v32i16(<32 x i16*> %ptrs, i32 8, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, %vals = call <32 x i16> @llvm.masked.gather.v32i16(<32 x i16*> %ptrs, i32 8, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
@@ -235,7 +224,7 @@ define void @masked_gather_v32i16(<32 x i16>* %a, <32 x i16*>* %b) #0 {
; LD1W ; LD1W
; ;
define void @masked_gather_v2i32(<2 x i32>* %a, <2 x i32*>* %b) #0 { define void @masked_gather_v2i32(<2 x i32>* %a, <2 x i32*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v2i32: ; CHECK-LABEL: masked_gather_v2i32:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x1] ; CHECK-NEXT: ldr q0, [x1]
@@ -250,7 +239,7 @@ define void @masked_gather_v2i32(<2 x i32>* %a, <2 x i32*>* %b) #0 {
ret void ret void
} }
define void @masked_gather_v4i32(<4 x i32>* %a, <4 x i32*>* %b) #0 { define void @masked_gather_v4i32(<4 x i32>* %a, <4 x i32*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v4i32: ; CHECK-LABEL: masked_gather_v4i32:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ptrue p0.d, vl4
@@ -266,21 +255,21 @@ define void @masked_gather_v4i32(<4 x i32>* %a, <4 x i32*>* %b) #0 {
} }
define void @masked_gather_v8i32(<8 x i32>* %a, <8 x i32*>* %b) #0 { define void @masked_gather_v8i32(<8 x i32>* %a, <8 x i32*>* %b) #0 {
; VBITS_EQ_256-LABEL: masked_gather_v8i32: ; VBITS_GE_256-LABEL: masked_gather_v8i32:
; VBITS_EQ_256: // %bb.0: ; VBITS_GE_256: // %bb.0:
; VBITS_EQ_256-NEXT: mov x8, #4 ; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_EQ_256-NEXT: ld1w { z0.d }, p0/z, [z0.d] ; VBITS_GE_256-NEXT: ld1w { z0.d }, p0/z, [z0.d]
; VBITS_EQ_256-NEXT: ld1w { z1.d }, p0/z, [z1.d] ; VBITS_GE_256-NEXT: ld1w { z1.d }, p0/z, [z1.d]
; VBITS_EQ_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: ptrue p0.s, vl4
; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
; VBITS_EQ_256-NEXT: splice z1.s, p0, z1.s, z0.s ; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s
; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0] ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_EQ_256-NEXT: ret ; VBITS_GE_256-NEXT: ret
; ;
; VBITS_GE_512-LABEL: masked_gather_v8i32: ; VBITS_GE_512-LABEL: masked_gather_v8i32:
; VBITS_GE_512: // %bb.0: ; VBITS_GE_512: // %bb.0:
@@ -297,16 +286,16 @@ define void @masked_gather_v8i32(<8 x i32>* %a, <8 x i32*>* %b) #0 {
ret void ret void
} }
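
; In the VBITS_GE_256 block above, splice glues the two narrowed halves: with
; p0.s set up as vl4, "splice z1.s, p0, z1.s, z0.s" keeps the first four lanes
; of z1 and appends lanes of z0, yielding the single vl8 vector that is stored.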
define void @masked_gather_v16i32(<16 x i32>* %a, <16 x i32*>* %b) #0 { define void @masked_gather_v16i32(<16 x i32>* %a, <16 x i32*>* %b) vscale_range(8,0) #0 {
; VBITS_GE_1024-LABEL: masked_gather_v16i32: ; CHECK-LABEL: masked_gather_v16i32:
; VBITS_GE_1024: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x1] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_1024-NEXT: ld1w { z0.d }, p0/z, [z0.d] ; CHECK-NEXT: ld1w { z0.d }, p0/z, [z0.d]
; VBITS_GE_1024-NEXT: ptrue p0.s, vl16 ; CHECK-NEXT: ptrue p0.s, vl16
; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_1024-NEXT: ret ; CHECK-NEXT: ret
%ptrs = load <16 x i32*>, <16 x i32*>* %b %ptrs = load <16 x i32*>, <16 x i32*>* %b
%vals = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptrs, i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, %vals = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptrs, i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
@@ -314,16 +303,16 @@ define void @masked_gather_v16i32(<16 x i32>* %a, <16 x i32*>* %b) #0 {
ret void ret void
} }
define void @masked_gather_v32i32(<32 x i32>* %a, <32 x i32*>* %b) #0 { define void @masked_gather_v32i32(<32 x i32>* %a, <32 x i32*>* %b) vscale_range(16,0) #0 {
; VBITS_GE_2048-LABEL: masked_gather_v32i32: ; CHECK-LABEL: masked_gather_v32i32:
; VBITS_GE_2048: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x1] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_2048-NEXT: ld1w { z0.d }, p0/z, [z0.d] ; CHECK-NEXT: ld1w { z0.d }, p0/z, [z0.d]
; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: ptrue p0.s, vl32
; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_2048-NEXT: ret ; CHECK-NEXT: ret
%ptrs = load <32 x i32*>, <32 x i32*>* %b %ptrs = load <32 x i32*>, <32 x i32*>* %b
%vals = call <32 x i32> @llvm.masked.gather.v32i32(<32 x i32*> %ptrs, i32 8, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, %vals = call <32 x i32> @llvm.masked.gather.v32i32(<32 x i32*> %ptrs, i32 8, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
@@ -337,7 +326,7 @@ define void @masked_gather_v32i32(<32 x i32>* %a, <32 x i32*>* %b) #0 {
; LD1D ; LD1D
; ;
define void @masked_gather_v2i64(<2 x i64>* %a, <2 x i64*>* %b) #0 { define void @masked_gather_v2i64(<2 x i64>* %a, <2 x i64*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v2i64: ; CHECK-LABEL: masked_gather_v2i64:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x1] ; CHECK-NEXT: ldr q0, [x1]
@@ -351,7 +340,7 @@ define void @masked_gather_v2i64(<2 x i64>* %a, <2 x i64*>* %b) #0 {
ret void ret void
} }
define void @masked_gather_v4i64(<4 x i64>* %a, <4 x i64*>* %b) #0 { define void @masked_gather_v4i64(<4 x i64>* %a, <4 x i64*>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v4i64: ; CHECK-LABEL: masked_gather_v4i64:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ptrue p0.d, vl4
@@ -366,17 +355,17 @@ define void @masked_gather_v4i64(<4 x i64>* %a, <4 x i64*>* %b) #0 {
} }
 define void @masked_gather_v8i64(<8 x i64>* %a, <8 x i64*>* %b) #0 {
-; VBITS_EQ_256-LABEL: masked_gather_v8i64:
-; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    mov x8, #4
-; VBITS_EQ_256-NEXT:    ptrue p0.d, vl4
-; VBITS_EQ_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_EQ_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_EQ_256-NEXT:    ld1d { z0.d }, p0/z, [z0.d]
-; VBITS_EQ_256-NEXT:    ld1d { z1.d }, p0/z, [z1.d]
-; VBITS_EQ_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_EQ_256-NEXT:    st1d { z1.d }, p0, [x0]
-; VBITS_EQ_256-NEXT:    ret
+; VBITS_GE_256-LABEL: masked_gather_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [z0.d]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [z1.d]
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: masked_gather_v8i64:
 ; VBITS_GE_512:       // %bb.0:
@@ -391,14 +380,14 @@ define void @masked_gather_v8i64(<8 x i64>* %a, <8 x i64*>* %b) #0 {
   ret void
 }
-define void @masked_gather_v16i64(<16 x i64>* %a, <16 x i64*>* %b) #0 {
-; VBITS_GE_1024-LABEL: masked_gather_v16i64:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    ld1d { z0.d }, p0/z, [z0.d]
-; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @masked_gather_v16i64(<16 x i64>* %a, <16 x i64*>* %b) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_gather_v16i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [z0.d]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %ptrs = load <16 x i64*>, <16 x i64*>* %b
   %vals = call <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
                                                     i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i64> undef)
@@ -406,14 +395,14 @@ define void @masked_gather_v16i64(<16 x i64>* %a, <16 x i64*>* %b) #0 {
   ret void
 }
-define void @masked_gather_v32i64(<32 x i64>* %a, <32 x i64*>* %b) #0 {
-; VBITS_GE_2048-LABEL: masked_gather_v32i64:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    ld1d { z0.d }, p0/z, [z0.d]
-; VBITS_GE_2048-NEXT:    st1d { z0.d }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_gather_v32i64(<32 x i64>* %a, <32 x i64*>* %b) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_gather_v32i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [z0.d]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %ptrs = load <32 x i64*>, <32 x i64*>* %b
   %vals = call <32 x i64> @llvm.masked.gather.v32i64(<32 x i64*> %ptrs, i32 8, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
                                                     i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true,
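A note for readers of this diff: the vscale_range attribute is what collapses the old per-width RUN line matrix. vscale counts 128-bit granules, so vscale_range(N,0) asserts that SVE registers are at least N x 128 bits wide (with no upper bound). A minimal sketch of the idea, with an invented function name rather than a test from this commit:

; Hypothetical example, not from the commit: vscale_range(2,0) asserts
; vscale >= 2, i.e. SVE registers of at least 2 x 128 = 256 bits, which is
; why a single RUN line at -aarch64-sve-vector-bits-min=256 now suffices.
define void @sketch_vscale_range(<8 x i32>* %a) vscale_range(2,0) #0 {
  %op = load <8 x i32>, <8 x i32>* %a
  store <8 x i32> %op, <8 x i32>* %a
  ret void
}
attributes #0 = { "target-features"="+sve" }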

File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -1,28 +1,15 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; Don't use SVE when its registers are no bigger than NEON.
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue
-; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 target triple = "aarch64-unknown-linux-gnu"
-;;
-;; Masked Stores
-;;
+;
+; Masked Stores
+;
-define void @masked_store_v2f16(<2 x half>* %ap, <2 x half>* %bp) #0 {
+define void @masked_store_v2f16(<2 x half>* %ap, <2 x half>* %bp) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_store_v2f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr s1, [x0]
@@ -52,8 +39,7 @@ define void @masked_store_v2f16(<2 x half>* %ap, <2 x half>* %bp) #0 {
   ret void
 }
-define void @masked_store_v2f32(<2 x float>* %ap, <2 x float>* %bp) #0 {
+define void @masked_store_v2f32(<2 x float>* %ap, <2 x float>* %bp) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_store_v2f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
@@ -70,7 +56,7 @@ define void @masked_store_v2f32(<2 x float>* %ap, <2 x float>* %bp) #0 {
   ret void
 }
-define void @masked_store_v4f32(<4 x float>* %ap, <4 x float>* %bp) #0 {
+define void @masked_store_v4f32(<4 x float>* %ap, <4 x float>* %bp) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_store_v4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
@@ -87,7 +73,7 @@ define void @masked_store_v4f32(<4 x float>* %ap, <4 x float>* %bp) #0 {
   ret void
 }
-define void @masked_store_v8f32(<8 x float>* %ap, <8 x float>* %bp) #0 {
+define void @masked_store_v8f32(<8 x float>* %ap, <8 x float>* %bp) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_store_v8f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl8
@@ -133,39 +119,15 @@ define void @masked_store_v16f32(<16 x float>* %ap, <16 x float>* %bp) #0 {
   ret void
 }
-define void @masked_store_v32f32(<32 x float>* %ap, <32 x float>* %bp) #0 {
-; VBITS_GE_256-LABEL: masked_store_v32f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #24
-; VBITS_GE_256-NEXT:    mov x9, #16
-; VBITS_GE_256-NEXT:    mov x10, #8
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z5.s }, p0/z, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z6.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z7.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    fcmeq p1.s, p0/z, z2.s, z5.s
-; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z1.s, z4.s
-; VBITS_GE_256-NEXT:    fcmeq p3.s, p0/z, z0.s, z6.s
-; VBITS_GE_256-NEXT:    fcmeq p0.s, p0/z, z3.s, z7.s
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p3, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p2, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p1, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: masked_store_v32f32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    fcmeq p0.s, p0/z, z0.s, z1.s
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
+define void @masked_store_v32f32(<32 x float>* %ap, <32 x float>* %bp) vscale_range(8,0) #0 {
+; CHECK-LABEL: masked_store_v32f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %a = load <32 x float>, <32 x float>* %ap
   %b = load <32 x float>, <32 x float>* %bp
   %mask = fcmp oeq <32 x float> %a, %b
@@ -173,59 +135,15 @@ define void @masked_store_v32f32(<32 x float>* %ap, <32 x float>* %bp) #0 {
   ret void
 }
-define void @masked_store_v64f32(<64 x float>* %ap, <64 x float>* %bp) #0 {
-; VBITS_GE_256-LABEL: masked_store_v64f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #56
-; VBITS_GE_256-NEXT:    mov x9, #48
-; VBITS_GE_256-NEXT:    mov x10, #40
-; VBITS_GE_256-NEXT:    mov x11, #32
-; VBITS_GE_256-NEXT:    mov x12, #24
-; VBITS_GE_256-NEXT:    mov x13, #16
-; VBITS_GE_256-NEXT:    mov x14, #8
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z5.s }, p0/z, [x0, x13, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z6.s }, p0/z, [x0, x14, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z7.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1w { z16.s }, p0/z, [x1, x13, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z17.s }, p0/z, [x1, x14, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z18.s }, p0/z, [x1, x11, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z19.s }, p0/z, [x1, x12, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z20.s }, p0/z, [x1, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z21.s }, p0/z, [x1, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z22.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z23.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    fcmeq p1.s, p0/z, z6.s, z17.s
-; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z5.s, z16.s
-; VBITS_GE_256-NEXT:    fcmeq p3.s, p0/z, z4.s, z19.s
-; VBITS_GE_256-NEXT:    fcmeq p4.s, p0/z, z3.s, z18.s
-; VBITS_GE_256-NEXT:    fcmeq p5.s, p0/z, z2.s, z21.s
-; VBITS_GE_256-NEXT:    fcmeq p6.s, p0/z, z1.s, z20.s
-; VBITS_GE_256-NEXT:    fcmeq p7.s, p0/z, z0.s, z22.s
-; VBITS_GE_256-NEXT:    fcmeq p0.s, p0/z, z7.s, z23.s
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p7, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p6, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p5, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p4, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z4.s }, p3, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z5.s }, p2, [x0, x13, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z6.s }, p1, [x0, x14, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z7.s }, p0, [x0]
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: masked_store_v64f32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    fcmeq p0.s, p0/z, z0.s, z1.s
-; VBITS_GE_2048-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+define void @masked_store_v64f32(<64 x float>* %ap, <64 x float>* %bp) vscale_range(16,0) #0 {
+; CHECK-LABEL: masked_store_v64f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %a = load <64 x float>, <64 x float>* %ap
   %b = load <64 x float>, <64 x float>* %bp
   %mask = fcmp oeq <64 x float> %a, %b
@@ -266,7 +184,6 @@ define void @masked_store_trunc_v8i64i8(<8 x i64>* %ap, <8 x i64>* %bp, <8 x i8>
 ; VBITS_GE_512-NEXT:    cmpeq p0.d, p0/z, z0.d, z1.d
 ; VBITS_GE_512-NEXT:    st1b { z0.d }, p0, [x2]
 ; VBITS_GE_512-NEXT:    ret
   %a = load <8 x i64>, <8 x i64>* %ap
   %b = load <8 x i64>, <8 x i64>* %bp
   %mask = icmp eq <8 x i64> %a, %b
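The masked_store_* tests above all reduce to the same IR shape; here is a minimal sketch of it (function name invented, not a line from the commit), assuming the standard llvm.masked.store intrinsic with typed-pointer mangling as used throughout this file:

define void @sketch_masked_store(<8 x float>* %ap, <8 x float>* %bp) vscale_range(2,0) #0 {
  ; Compare two loaded vectors and store the first back only in the lanes
  ; where the comparison held, via a masked store.
  %a = load <8 x float>, <8 x float>* %ap
  %b = load <8 x float>, <8 x float>* %bp
  %mask = fcmp oeq <8 x float> %a, %b
  call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> %a, <8 x float>* %ap, i32 8, <8 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v8f32.p0v8f32(<8 x float>, <8 x float>*, i32, <8 x i1>)
attributes #0 = { "target-features"="+sve" }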

@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 target triple = "aarch64-unknown-linux-gnu"
@@ -163,27 +163,27 @@ define void @test_revwv8i32v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
 ; REVH pattern for shuffle v32i16 with 256 bits and 512 bits SVE.
 define void @test_revhv32i16(<32 x i16>* %a) #0 {
-; VBITS_EQ_256-LABEL: test_revhv32i16:
-; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    mov x8, #16
-; VBITS_EQ_256-NEXT:    ptrue p0.h, vl16
-; VBITS_EQ_256-NEXT:    ptrue p1.d
-; VBITS_EQ_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_EQ_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
-; VBITS_EQ_256-NEXT:    revh z0.d, p1/m, z0.d
-; VBITS_EQ_256-NEXT:    revh z1.d, p1/m, z1.d
-; VBITS_EQ_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_EQ_256-NEXT:    st1h { z1.h }, p0, [x0]
-; VBITS_EQ_256-NEXT:    ret
-;
 ; VBITS_GE_256-LABEL: test_revhv32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT:    ptrue p1.d
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    revh z0.d, p1/m, z0.d
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    revh z1.d, p1/m, z1.d
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: test_revhv32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ptrue p1.d
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    revh z0.d, p1/m, z0.d
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %tmp1 = load <32 x i16>, <32 x i16>* %a
   %tmp2 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20, i32 27, i32 undef, i32 undef, i32 undef, i32 31, i32 30, i32 29, i32 undef>
   store <32 x i16> %tmp2, <32 x i16>* %a
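For context on the REVH checks above: the shuffle being matched reverses the 16-bit elements inside each 64-bit chunk. A reduced hypothetical example (invented function name, not one of the tests in this diff):

define void @sketch_revh(<16 x i16>* %a) #0 {
  ; Indices 3,2,1,0 / 7,6,5,4 / ... reverse each group of four i16s, i.e.
  ; each 64-bit lane, which fixed-length SVE can lower to a predicated REVH.
  %v = load <16 x i16>, <16 x i16>* %a
  %r = shufflevector <16 x i16> %v, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
  store <16 x i16> %r, <16 x i16>* %a
  ret void
}
attributes #0 = { "target-features"="+sve" }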

@@ -1,54 +1,46 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 target triple = "aarch64-unknown-linux-gnu"
-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
 ;
 ; RBIT
 ;
-define <8 x i8> @bitreverse_v8i8(<8 x i8> %op) #0 {
+define <8 x i8> @bitreverse_v8i8(<8 x i8> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitreverse_v8i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl8
-; CHECK-NEXT: rbit z0.b, [[PG]]/m, z0.b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %res = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %op)
   ret <8 x i8> %res
 }
-define <16 x i8> @bitreverse_v16i8(<16 x i8> %op) #0 {
+define <16 x i8> @bitreverse_v16i8(<16 x i8> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitreverse_v16i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl16
-; CHECK-NEXT: rbit z0.b, [[PG]]/m, z0.b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
   %res = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %op)
   ret <16 x i8> %res
 }
-define void @bitreverse_v32i8(<32 x i8>* %a) #0 {
+define void @bitreverse_v32i8(<32 x i8>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitreverse_v32i8:
-; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
-; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; CHECK-NEXT: rbit [[RES:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x i8>, <32 x i8>* %a
   %res = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %op)
   store <32 x i8> %res, <32 x i8>* %a
@@ -56,80 +48,91 @@ define void @bitreverse_v32i8(<32 x i8>* %a) #0 {
 }
 define void @bitreverse_v64i8(<64 x i8>* %a) #0 {
-; CHECK-LABEL: bitreverse_v64i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
-; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: rbit [[RES:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: bitreverse_v64i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    rbit z0.b, p0/m, z0.b
+; VBITS_GE_256-NEXT:    rbit z1.b, p0/m, z1.b
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
 ;
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
-; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
-; VBITS_EQ_256-DAG: ld1b { [[OP_LO:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1b { [[OP_HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-DAG: rbit [[RES_LO:z[0-9]+]].b, [[PG]]/m, [[OP_LO]].b
-; VBITS_EQ_256-DAG: rbit [[RES_HI:z[0-9]+]].b, [[PG]]/m, [[OP_HI]].b
-; VBITS_EQ_256-DAG: st1b { [[RES_LO]].b }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1b { [[RES_HI]].b }, [[PG]], [x0, x[[NUMELTS]]]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_512-LABEL: bitreverse_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    rbit z0.b, p0/m, z0.b
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <64 x i8>, <64 x i8>* %a
   %res = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %op)
   store <64 x i8> %res, <64 x i8>* %a
   ret void
 }
-define void @bitreverse_v128i8(<128 x i8>* %a) #0 {
+define void @bitreverse_v128i8(<128 x i8>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: bitreverse_v128i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
-; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: rbit [[RES:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <128 x i8>, <128 x i8>* %a
   %res = call <128 x i8> @llvm.bitreverse.v128i8(<128 x i8> %op)
   store <128 x i8> %res, <128 x i8>* %a
   ret void
 }
-define void @bitreverse_v256i8(<256 x i8>* %a) #0 {
+define void @bitreverse_v256i8(<256 x i8>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: bitreverse_v256i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
-; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: rbit [[RES:z[0-9]+]].b, [[PG]]/m, [[OP]].b
-; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl256
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <256 x i8>, <256 x i8>* %a
   %res = call <256 x i8> @llvm.bitreverse.v256i8(<256 x i8> %op)
   store <256 x i8> %res, <256 x i8>* %a
   ret void
 }
-define <4 x i16> @bitreverse_v4i16(<4 x i16> %op) #0 {
+define <4 x i16> @bitreverse_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitreverse_v4i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl4
-; CHECK-NEXT: rbit z0.h, [[PG]]/m, z0.h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %res = call <4 x i16> @llvm.bitreverse.v4i16(<4 x i16> %op)
   ret <4 x i16> %res
 }
-define <8 x i16> @bitreverse_v8i16(<8 x i16> %op) #0 {
+define <8 x i16> @bitreverse_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitreverse_v8i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl8
-; CHECK-NEXT: rbit z0.h, [[PG]]/m, z0.h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
   %res = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %op)
   ret <8 x i16> %res
 }
-define void @bitreverse_v16i16(<16 x i16>* %a) #0 {
+define void @bitreverse_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitreverse_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: rbit [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <16 x i16>, <16 x i16>* %a
   %res = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %op)
   store <16 x i16> %res, <16 x i16>* %a
@@ -137,80 +140,91 @@ define void @bitreverse_v16i16(<16 x i16>* %a) #0 {
 }
 define void @bitreverse_v32i16(<32 x i16>* %a) #0 {
-; CHECK-LABEL: bitreverse_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: rbit [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-;
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: rbit [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP_LO]].h
-; VBITS_EQ_256-DAG: rbit [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: bitreverse_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    rbit z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT:    rbit z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: bitreverse_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    rbit z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <32 x i16>, <32 x i16>* %a
   %res = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %op)
   store <32 x i16> %res, <32 x i16>* %a
   ret void
 }
-define void @bitreverse_v64i16(<64 x i16>* %a) #0 {
+define void @bitreverse_v64i16(<64 x i16>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: bitreverse_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: rbit [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <64 x i16>, <64 x i16>* %a
   %res = call <64 x i16> @llvm.bitreverse.v64i16(<64 x i16> %op)
   store <64 x i16> %res, <64 x i16>* %a
   ret void
 }
-define void @bitreverse_v128i16(<128 x i16>* %a) #0 {
+define void @bitreverse_v128i16(<128 x i16>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: bitreverse_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: rbit [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <128 x i16>, <128 x i16>* %a
   %res = call <128 x i16> @llvm.bitreverse.v128i16(<128 x i16> %op)
   store <128 x i16> %res, <128 x i16>* %a
   ret void
 }
-define <2 x i32> @bitreverse_v2i32(<2 x i32> %op) #0 {
+define <2 x i32> @bitreverse_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitreverse_v2i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
-; CHECK-NEXT: rbit z0.s, [[PG]]/m, z0.s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %res = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %op)
   ret <2 x i32> %res
 }
-define <4 x i32> @bitreverse_v4i32(<4 x i32> %op) #0 {
+define <4 x i32> @bitreverse_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitreverse_v4i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
-; CHECK-NEXT: rbit z0.s, [[PG]]/m, z0.s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
   %res = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %op)
   ret <4 x i32> %res
 }
-define void @bitreverse_v8i32(<8 x i32>* %a) #0 {
+define void @bitreverse_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitreverse_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: rbit [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <8 x i32>, <8 x i32>* %a
   %res = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %op)
   store <8 x i32> %res, <8 x i32>* %a
@@ -218,80 +232,91 @@ define void @bitreverse_v8i32(<8 x i32>* %a) #0 {
 }
 define void @bitreverse_v16i32(<16 x i32>* %a) #0 {
-; CHECK-LABEL: bitreverse_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: rbit [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-;
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: rbit [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s
-; VBITS_EQ_256-DAG: rbit [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: bitreverse_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    rbit z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT:    rbit z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: bitreverse_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    rbit z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <16 x i32>, <16 x i32>* %a
   %res = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %op)
   store <16 x i32> %res, <16 x i32>* %a
   ret void
 }
-define void @bitreverse_v32i32(<32 x i32>* %a) #0 {
+define void @bitreverse_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: bitreverse_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: rbit [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x i32>, <32 x i32>* %a
   %res = call <32 x i32> @llvm.bitreverse.v32i32(<32 x i32> %op)
   store <32 x i32> %res, <32 x i32>* %a
   ret void
 }
-define void @bitreverse_v64i32(<64 x i32>* %a) #0 {
+define void @bitreverse_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: bitreverse_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: rbit [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <64 x i32>, <64 x i32>* %a
   %res = call <64 x i32> @llvm.bitreverse.v64i32(<64 x i32> %op)
   store <64 x i32> %res, <64 x i32>* %a
   ret void
 }
-define <1 x i64> @bitreverse_v1i64(<1 x i64> %op) #0 {
+define <1 x i64> @bitreverse_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitreverse_v1i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
-; CHECK-NEXT: rbit z0.d, [[PG]]/m, z0.d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
   %res = call <1 x i64> @llvm.bitreverse.v1i64(<1 x i64> %op)
   ret <1 x i64> %res
 }
-define <2 x i64> @bitreverse_v2i64(<2 x i64> %op) #0 {
+define <2 x i64> @bitreverse_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitreverse_v2i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
-; CHECK-NEXT: rbit z0.d, [[PG]]/m, z0.d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
   %res = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %op)
   ret <2 x i64> %res
 }
-define void @bitreverse_v4i64(<4 x i64>* %a) #0 {
+define void @bitreverse_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitreverse_v4i64:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: rbit [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <4 x i64>, <4 x i64>* %a
   %res = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %op)
   store <4 x i64> %res, <4 x i64>* %a
@@ -299,49 +324,53 @@ define void @bitreverse_v4i64(<4 x i64>* %a) #0 {
 }
 define void @bitreverse_v8i64(<8 x i64>* %a) #0 {
-; CHECK-LABEL: bitreverse_v8i64:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: rbit [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-;
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[OP_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[OP_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: rbit [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d
-; VBITS_EQ_256-DAG: rbit [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d
-; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: bitreverse_v8i64:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    rbit z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT:    rbit z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: bitreverse_v8i64:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    rbit z0.d, p0/m, z0.d
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <8 x i64>, <8 x i64>* %a
   %res = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %op)
   store <8 x i64> %res, <8 x i64>* %a
   ret void
 }
-define void @bitreverse_v16i64(<16 x i64>* %a) #0 {
+define void @bitreverse_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: bitreverse_v16i64:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: rbit [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <16 x i64>, <16 x i64>* %a
   %res = call <16 x i64> @llvm.bitreverse.v16i64(<16 x i64> %op)
   store <16 x i64> %res, <16 x i64>* %a
   ret void
 }
-define void @bitreverse_v32i64(<32 x i64>* %a) #0 {
+define void @bitreverse_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: bitreverse_v32i64:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: rbit [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x i64>, <32 x i64>* %a
   %res = call <32 x i64> @llvm.bitreverse.v32i64(<32 x i64> %op)
   store <32 x i64> %res, <32 x i64>* %a
@@ -353,30 +382,33 @@ define void @bitreverse_v32i64(<32 x i64>* %a) #0 {
 ;
 ; Don't use SVE for 64-bit vectors.
-define <4 x i16> @bswap_v4i16(<4 x i16> %op) #0 {
+define <4 x i16> @bswap_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bswap_v4i16:
-; CHECK: rev16 v0.8b, v0.8b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev16 v0.8b, v0.8b
+; CHECK-NEXT:    ret
   %res = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %op)
   ret <4 x i16> %res
 }
 ; Don't use SVE for 128-bit vectors.
-define <8 x i16> @bswap_v8i16(<8 x i16> %op) #0 {
+define <8 x i16> @bswap_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bswap_v8i16:
-; CHECK: rev16 v0.16b, v0.16b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev16 v0.16b, v0.16b
+; CHECK-NEXT:    ret
   %res = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %op)
   ret <8 x i16> %res
 }
-define void @bswap_v16i16(<16 x i16>* %a) #0 {
+define void @bswap_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bswap_v16i16:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: revb [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    revb z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <16 x i16>, <16 x i16>* %a
   %res = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %op)
   store <16 x i16> %res, <16 x i16>* %a
@@ -384,49 +416,53 @@ define void @bswap_v16i16(<16 x i16>* %a) #0 {
 }
 define void @bswap_v32i16(<32 x i16>* %a) #0 {
-; CHECK-LABEL: bswap_v32i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: revb [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-;
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[OP_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[OP_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: revb [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP_LO]].h
-; VBITS_EQ_256-DAG: revb [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP_HI]].h
-; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: bswap_v32i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    revb z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT:    revb z1.h, p0/m, z1.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: bswap_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    revb z0.h, p0/m, z0.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <32 x i16>, <32 x i16>* %a
   %res = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %op)
   store <32 x i16> %res, <32 x i16>* %a
   ret void
 }
-define void @bswap_v64i16(<64 x i16>* %a) #0 {
+define void @bswap_v64i16(<64 x i16>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: bswap_v64i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: revb [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    revb z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <64 x i16>, <64 x i16>* %a
   %res = call <64 x i16> @llvm.bswap.v64i16(<64 x i16> %op)
   store <64 x i16> %res, <64 x i16>* %a
   ret void
 }
-define void @bswap_v128i16(<128 x i16>* %a) #0 {
+define void @bswap_v128i16(<128 x i16>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: bswap_v128i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: revb [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h
-; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    revb z0.h, p0/m, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <128 x i16>, <128 x i16>* %a
   %res = call <128 x i16> @llvm.bswap.v128i16(<128 x i16> %op)
   store <128 x i16> %res, <128 x i16>* %a
@@ -434,30 +470,33 @@ define void @bswap_v128i16(<128 x i16>* %a) #0 {
 }
 ; Don't use SVE for 64-bit vectors.
-define <2 x i32> @bswap_v2i32(<2 x i32> %op) #0 {
+define <2 x i32> @bswap_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bswap_v2i32:
-; CHECK: rev32 v0.8b, v0.8b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev32 v0.8b, v0.8b
+; CHECK-NEXT:    ret
   %res = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %op)
   ret <2 x i32> %res
 }
 ; Don't use SVE for 128-bit vectors.
-define <4 x i32> @bswap_v4i32(<4 x i32> %op) #0 {
+define <4 x i32> @bswap_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bswap_v4i32:
-; CHECK: rev32 v0.16b, v0.16b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev32 v0.16b, v0.16b
+; CHECK-NEXT:    ret
   %res = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %op)
   ret <4 x i32> %res
 }
-define void @bswap_v8i32(<8 x i32>* %a) #0 {
+define void @bswap_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bswap_v8i32:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: revb [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    revb z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <8 x i32>, <8 x i32>* %a
   %res = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %op)
   store <8 x i32> %res, <8 x i32>* %a
@@ -465,49 +504,53 @@ define void @bswap_v8i32(<8 x i32>* %a) #0 {
 }
 define void @bswap_v16i32(<16 x i32>* %a) #0 {
-; CHECK-LABEL: bswap_v16i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: revb [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_512-NEXT: ret
-;
-; Ensure sensible type legalisation.
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[OP_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[OP_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: revb [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP_LO]].s
-; VBITS_EQ_256-DAG: revb [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP_HI]].s
-; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
-; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: bswap_v16i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    revb z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT:    revb z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
+;
+; VBITS_GE_512-LABEL: bswap_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    revb z0.s, p0/m, z0.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op = load <16 x i32>, <16 x i32>* %a
   %res = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %op)
   store <16 x i32> %res, <16 x i32>* %a
   ret void
 }
-define void @bswap_v32i32(<32 x i32>* %a) #0 {
+define void @bswap_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: bswap_v32i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: revb [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_1024-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    revb z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <32 x i32>, <32 x i32>* %a
   %res = call <32 x i32> @llvm.bswap.v32i32(<32 x i32> %op)
   store <32 x i32> %res, <32 x i32>* %a
   ret void
 }
-define void @bswap_v64i32(<64 x i32>* %a) #0 {
+define void @bswap_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: bswap_v64i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048-NEXT: revb [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s
-; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
-; VBITS_GE_2048-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    revb z0.s, p0/m, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
   %op = load <64 x i32>, <64 x i32>* %a
   %res = call <64 x i32> @llvm.bswap.v64i32(<64 x i32> %op)
   store <64 x i32> %res, <64 x i32>* %a
@@ -515,30 +558,33 @@ define void @bswap_v64i32(<64 x i32>* %a) #0 {
 }
 ; Don't use SVE for 64-bit vectors.
-define <1 x i64> @bswap_v1i64(<1 x i64> %op) #0 {
+define <1 x i64> @bswap_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bswap_v1i64:
-; CHECK: rev64 v0.8b, v0.8b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev64 v0.8b, v0.8b
+; CHECK-NEXT:    ret
   %res = call <1 x i64> @llvm.bswap.v1i64(<1 x i64> %op)
   ret <1 x i64> %res
 }
 ; Don't use SVE for 128-bit vectors.
-define <2 x i64> @bswap_v2i64(<2 x i64> %op) #0 {
+define <2 x i64> @bswap_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bswap_v2i64:
-; CHECK: rev64 v0.16b, v0.16b
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev64 v0.16b, v0.16b
+; CHECK-NEXT:    ret
   %res = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %op)
   ret <2 x i64> %res
 }
define void @bswap_v4i64(<4 x i64>* %a) #0 { define void @bswap_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v4i64: ; CHECK-LABEL: bswap_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4 ; CHECK: // %bb.0:
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] ; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: revb [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0] ; CHECK-NEXT: revb z0.d, p0/m, z0.d
; CHECK-NEXT: ret ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op = load <4 x i64>, <4 x i64>* %a %op = load <4 x i64>, <4 x i64>* %a
%res = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %op) %res = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %op)
store <4 x i64> %res, <4 x i64>* %a store <4 x i64> %res, <4 x i64>* %a
@ -546,49 +592,53 @@ define void @bswap_v4i64(<4 x i64>* %a) #0 {
} }
define void @bswap_v8i64(<8 x i64>* %a) #0 { define void @bswap_v8i64(<8 x i64>* %a) #0 {
; CHECK-LABEL: bswap_v8i64: ; VBITS_GE_256-LABEL: bswap_v8i64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8 ; VBITS_GE_256: // %bb.0:
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] ; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_512-NEXT: revb [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0] ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_512-NEXT: ret ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: revb z0.d, p0/m, z0.d
; Ensure sensible type legalisation. ; VBITS_GE_256-NEXT: revb z1.d, p0/m, z1.d
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_EQ_256-DAG: ld1d { [[OP_LO:z[0-9]+]].d }, [[PG]]/z, [x0] ; VBITS_GE_256-NEXT: ret
; VBITS_EQ_256-DAG: ld1d { [[OP_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3] ;
; VBITS_EQ_256-DAG: revb [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP_LO]].d ; VBITS_GE_512-LABEL: bswap_v8i64:
; VBITS_EQ_256-DAG: revb [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP_HI]].d ; VBITS_GE_512: // %bb.0:
; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3] ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_EQ_256-NEXT: ret ; VBITS_GE_512-NEXT: revb z0.d, p0/m, z0.d
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op = load <8 x i64>, <8 x i64>* %a %op = load <8 x i64>, <8 x i64>* %a
%res = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %op) %res = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %op)
store <8 x i64> %res, <8 x i64>* %a store <8 x i64> %res, <8 x i64>* %a
ret void ret void
} }
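The VBITS_GE_256 body above is checking type legalisation: an <8 x i64> operation on a 256-bit machine is split into two <4 x i64> halves, each loaded, byte-reversed and stored separately. A rough IR-level picture of that split (hypothetical function, for illustration only):

define void @bswap_v8i64_as_halves(<8 x i64>* %a) #0 {
  %op = load <8 x i64>, <8 x i64>* %a
  %lo = shufflevector <8 x i64> %op, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %hi = shufflevector <8 x i64> %op, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %rlo = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %lo)
  %rhi = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %hi)
  %res = shufflevector <4 x i64> %rlo, <4 x i64> %rhi, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i64> %res, <8 x i64>* %a
  ret void
}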
define void @bswap_v16i64(<16 x i64>* %a) #0 { define void @bswap_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bswap_v16i64: ; CHECK-LABEL: bswap_v16i64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16 ; CHECK: // %bb.0:
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] ; CHECK-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: revb [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0] ; CHECK-NEXT: revb z0.d, p0/m, z0.d
; VBITS_GE_1024-NEXT: ret ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op = load <16 x i64>, <16 x i64>* %a %op = load <16 x i64>, <16 x i64>* %a
%res = call <16 x i64> @llvm.bswap.v16i64(<16 x i64> %op) %res = call <16 x i64> @llvm.bswap.v16i64(<16 x i64> %op)
store <16 x i64> %res, <16 x i64>* %a store <16 x i64> %res, <16 x i64>* %a
ret void ret void
} }
define void @bswap_v32i64(<32 x i64>* %a) #0 { define void @bswap_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bswap_v32i64: ; CHECK-LABEL: bswap_v32i64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32 ; CHECK: // %bb.0:
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] ; CHECK-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: revb [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0] ; CHECK-NEXT: revb z0.d, p0/m, z0.d
; VBITS_GE_2048-NEXT: ret ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op = load <32 x i64>, <32 x i64>* %a %op = load <32 x i64>, <32 x i64>* %a
%res = call <32 x i64> @llvm.bswap.v32i64(<32 x i64> %op) %res = call <32 x i64> @llvm.bswap.v32i64(<32 x i64> %op)
store <32 x i64> %res, <32 x i64>* %a store <32 x i64> %res, <32 x i64>* %a
@ -640,4 +690,3 @@ declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>)
declare <8 x i64> @llvm.bswap.v8i64(<8 x i64>) declare <8 x i64> @llvm.bswap.v8i64(<8 x i64>)
declare <16 x i64> @llvm.bswap.v16i64(<16 x i64>) declare <16 x i64> @llvm.bswap.v16i64(<16 x i64>)
declare <32 x i64> @llvm.bswap.v32i64(<32 x i64>) declare <32 x i64> @llvm.bswap.v32i64(<32 x i64>)
View File
@ -1,23 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
target triple = "aarch64-unknown-linux-gnu" target triple = "aarch64-unknown-linux-gnu"
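The remaining RUN lines only need to cover the widths where codegen genuinely differs; a function tagged with vscale_range produces the same output under every RUN configuration that satisfies the range, so a single CHECK body suffices for it. A minimal sketch of the pattern (hypothetical file, assuming #0 carries "+sve"):

; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s
define void @example(<8 x i32>* %a) vscale_range(2,0) #0 {
  %op = load <8 x i32>, <8 x i32>* %a
  store <8 x i32> %op, <8 x i32>* %a
  ret void
}
attributes #0 = { "target-features"="+sve" }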
define <8 x i8> @sdiv_v8i8(<8 x i8> %op1) #0 { define <8 x i8> @sdiv_v8i8(<8 x i8> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v8i8: ; CHECK-LABEL: sdiv_v8i8:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@ -29,7 +17,7 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1) #0 {
ret <8 x i8> %res ret <8 x i8> %res
} }
define <16 x i8> @sdiv_v16i8(<16 x i8> %op1) #0 { define <16 x i8> @sdiv_v16i8(<16 x i8> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v16i8: ; CHECK-LABEL: sdiv_v16i8:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@ -41,7 +29,7 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1) #0 {
ret <16 x i8> %res ret <16 x i8> %res
} }
define void @sdiv_v32i8(<32 x i8>* %a) #0 { define void @sdiv_v32i8(<32 x i8>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v32i8: ; CHECK-LABEL: sdiv_v32i8:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl32 ; CHECK-NEXT: ptrue p0.b, vl32
@ -81,91 +69,35 @@ define void @sdiv_v64i8(<64 x i8>* %a) #0 {
ret void ret void
} }
define void @sdiv_v128i8(<128 x i8>* %a) #0 { define void @sdiv_v128i8(<128 x i8>* %a) vscale_range(8,0) #0 {
; VBITS_GE_256-LABEL: sdiv_v128i8: ; CHECK-LABEL: sdiv_v128i8:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov w8, #96 ; CHECK-NEXT: ptrue p0.b, vl128
; VBITS_GE_256-NEXT: mov w9, #32 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: mov w10, #64 ; CHECK-NEXT: asrd z0.b, p0/m, z0.b, #5
; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; CHECK-NEXT: ret
; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x9]
; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0, x10]
; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: asrd z1.b, p0/m, z1.b, #5
; VBITS_GE_256-NEXT: asrd z0.b, p0/m, z0.b, #5
; VBITS_GE_256-NEXT: asrd z2.b, p0/m, z2.b, #5
; VBITS_GE_256-NEXT: asrd z3.b, p0/m, z3.b, #5
; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0, x10]
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0, x9]
; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: sdiv_v128i8:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.b, vl128
; VBITS_GE_1024-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_1024-NEXT: asrd z0.b, p0/m, z0.b, #5
; VBITS_GE_1024-NEXT: st1b { z0.b }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
%op1 = load <128 x i8>, <128 x i8>* %a %op1 = load <128 x i8>, <128 x i8>* %a
%res = sdiv <128 x i8> %op1, shufflevector (<128 x i8> insertelement (<128 x i8> poison, i8 32, i32 0), <128 x i8> poison, <128 x i32> zeroinitializer) %res = sdiv <128 x i8> %op1, shufflevector (<128 x i8> insertelement (<128 x i8> poison, i8 32, i32 0), <128 x i8> poison, <128 x i32> zeroinitializer)
store <128 x i8> %res, <128 x i8>* %a store <128 x i8> %res, <128 x i8>* %a
ret void ret void
} }
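The divisor in these tests is a splat of 32 = 2^5, so the sdiv is strength-reduced to a single asrd (arithmetic shift right for divide, rounding towards zero) by #5 rather than a true division. The scalar equivalent of what each lane computes (a sketch, not part of the test):

define i8 @sdiv_lane_by_32(i8 %x) {
  ; sdiv rounds towards zero, which is what asrd implements
  ; (a plain asr would round towards minus infinity)
  %r = sdiv i8 %x, 32
  ret i8 %r
}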
define void @sdiv_v256i8(<256 x i8>* %a) #0 { define void @sdiv_v256i8(<256 x i8>* %a) vscale_range(16,0) #0 {
; VBITS_GE_256-LABEL: sdiv_v256i8: ; CHECK-LABEL: sdiv_v256i8:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov w8, #192 ; CHECK-NEXT: ptrue p0.b, vl256
; VBITS_GE_256-NEXT: mov w9, #96 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: mov w10, #32 ; CHECK-NEXT: asrd z0.b, p0/m, z0.b, #5
; VBITS_GE_256-NEXT: mov w11, #160 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; VBITS_GE_256-NEXT: mov w12, #64 ; CHECK-NEXT: ret
; VBITS_GE_256-NEXT: mov w13, #224
; VBITS_GE_256-NEXT: mov w14, #128
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x9]
; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x10]
; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0, x11]
; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x0, x12]
; VBITS_GE_256-NEXT: ld1b { z4.b }, p0/z, [x0, x13]
; VBITS_GE_256-NEXT: ld1b { z5.b }, p0/z, [x0, x14]
; VBITS_GE_256-NEXT: ld1b { z6.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT: ld1b { z7.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: asrd z1.b, p0/m, z1.b, #5
; VBITS_GE_256-NEXT: asrd z0.b, p0/m, z0.b, #5
; VBITS_GE_256-NEXT: asrd z3.b, p0/m, z3.b, #5
; VBITS_GE_256-NEXT: asrd z2.b, p0/m, z2.b, #5
; VBITS_GE_256-NEXT: asrd z5.b, p0/m, z5.b, #5
; VBITS_GE_256-NEXT: asrd z4.b, p0/m, z4.b, #5
; VBITS_GE_256-NEXT: asrd z6.b, p0/m, z6.b, #5
; VBITS_GE_256-NEXT: asrd z7.b, p0/m, z7.b, #5
; VBITS_GE_256-NEXT: st1b { z6.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT: st1b { z4.b }, p0, [x0, x13]
; VBITS_GE_256-NEXT: st1b { z5.b }, p0, [x0, x14]
; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0, x11]
; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0, x12]
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x9]
; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0, x10]
; VBITS_GE_256-NEXT: st1b { z7.b }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: sdiv_v256i8:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.b, vl256
; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_2048-NEXT: asrd z0.b, p0/m, z0.b, #5
; VBITS_GE_2048-NEXT: st1b { z0.b }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
%op1 = load <256 x i8>, <256 x i8>* %a %op1 = load <256 x i8>, <256 x i8>* %a
%res = sdiv <256 x i8> %op1, shufflevector (<256 x i8> insertelement (<256 x i8> poison, i8 32, i32 0), <256 x i8> poison, <256 x i32> zeroinitializer) %res = sdiv <256 x i8> %op1, shufflevector (<256 x i8> insertelement (<256 x i8> poison, i8 32, i32 0), <256 x i8> poison, <256 x i32> zeroinitializer)
store <256 x i8> %res, <256 x i8>* %a store <256 x i8> %res, <256 x i8>* %a
ret void ret void
} }
define <4 x i16> @sdiv_v4i16(<4 x i16> %op1) #0 { define <4 x i16> @sdiv_v4i16(<4 x i16> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v4i16: ; CHECK-LABEL: sdiv_v4i16:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@ -177,7 +109,7 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1) #0 {
ret <4 x i16> %res ret <4 x i16> %res
} }
define <8 x i16> @sdiv_v8i16(<8 x i16> %op1) #0 { define <8 x i16> @sdiv_v8i16(<8 x i16> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v8i16: ; CHECK-LABEL: sdiv_v8i16:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@ -189,7 +121,7 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1) #0 {
ret <8 x i16> %res ret <8 x i16> %res
} }
define void @sdiv_v16i16(<16 x i16>* %a) #0 { define void @sdiv_v16i16(<16 x i16>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v16i16: ; CHECK-LABEL: sdiv_v16i16:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: ptrue p0.h, vl16
@ -229,91 +161,35 @@ define void @sdiv_v32i16(<32 x i16>* %a) #0 {
ret void ret void
} }
define void @sdiv_v64i16(<64 x i16>* %a) #0 { define void @sdiv_v64i16(<64 x i16>* %a) vscale_range(8,0) #0 {
; VBITS_GE_256-LABEL: sdiv_v64i16: ; CHECK-LABEL: sdiv_v64i16:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #48 ; CHECK-NEXT: ptrue p0.h, vl64
; VBITS_GE_256-NEXT: mov x9, #16 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: mov x10, #32 ; CHECK-NEXT: asrd z0.h, p0/m, z0.h, #5
; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; CHECK-NEXT: ret
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: asrd z1.h, p0/m, z1.h, #5
; VBITS_GE_256-NEXT: asrd z0.h, p0/m, z0.h, #5
; VBITS_GE_256-NEXT: asrd z2.h, p0/m, z2.h, #5
; VBITS_GE_256-NEXT: asrd z3.h, p0/m, z3.h, #5
; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: sdiv_v64i16:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: asrd z0.h, p0/m, z0.h, #5
; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
%op1 = load <64 x i16>, <64 x i16>* %a %op1 = load <64 x i16>, <64 x i16>* %a
%res = sdiv <64 x i16> %op1, shufflevector (<64 x i16> insertelement (<64 x i16> poison, i16 32, i32 0), <64 x i16> poison, <64 x i32> zeroinitializer) %res = sdiv <64 x i16> %op1, shufflevector (<64 x i16> insertelement (<64 x i16> poison, i16 32, i32 0), <64 x i16> poison, <64 x i32> zeroinitializer)
store <64 x i16> %res, <64 x i16>* %a store <64 x i16> %res, <64 x i16>* %a
ret void ret void
} }
define void @sdiv_v128i16(<128 x i16>* %a) #0 { define void @sdiv_v128i16(<128 x i16>* %a) vscale_range(16,0) #0 {
; VBITS_GE_256-LABEL: sdiv_v128i16: ; CHECK-LABEL: sdiv_v128i16:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #96 ; CHECK-NEXT: ptrue p0.h, vl128
; VBITS_GE_256-NEXT: mov x9, #48 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: mov x10, #16 ; CHECK-NEXT: asrd z0.h, p0/m, z0.h, #5
; VBITS_GE_256-NEXT: mov x11, #80 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_256-NEXT: mov x12, #32 ; CHECK-NEXT: ret
; VBITS_GE_256-NEXT: mov x13, #112
; VBITS_GE_256-NEXT: mov x14, #64
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x11, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x12, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x0, x13, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x0, x14, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: asrd z1.h, p0/m, z1.h, #5
; VBITS_GE_256-NEXT: asrd z0.h, p0/m, z0.h, #5
; VBITS_GE_256-NEXT: asrd z3.h, p0/m, z3.h, #5
; VBITS_GE_256-NEXT: asrd z2.h, p0/m, z2.h, #5
; VBITS_GE_256-NEXT: asrd z5.h, p0/m, z5.h, #5
; VBITS_GE_256-NEXT: asrd z4.h, p0/m, z4.h, #5
; VBITS_GE_256-NEXT: asrd z6.h, p0/m, z6.h, #5
; VBITS_GE_256-NEXT: asrd z7.h, p0/m, z7.h, #5
; VBITS_GE_256-NEXT: st1h { z6.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0, x13, lsl #1]
; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x0, x14, lsl #1]
; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x11, lsl #1]
; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x12, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: sdiv_v128i16:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_2048-NEXT: asrd z0.h, p0/m, z0.h, #5
; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
%op1 = load <128 x i16>, <128 x i16>* %a %op1 = load <128 x i16>, <128 x i16>* %a
%res = sdiv <128 x i16> %op1, shufflevector (<128 x i16> insertelement (<128 x i16> poison, i16 32, i32 0), <128 x i16> poison, <128 x i32> zeroinitializer) %res = sdiv <128 x i16> %op1, shufflevector (<128 x i16> insertelement (<128 x i16> poison, i16 32, i32 0), <128 x i16> poison, <128 x i32> zeroinitializer)
store <128 x i16> %res, <128 x i16>* %a store <128 x i16> %res, <128 x i16>* %a
ret void ret void
} }
define <2 x i32> @sdiv_v2i32(<2 x i32> %op1) #0 { define <2 x i32> @sdiv_v2i32(<2 x i32> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v2i32: ; CHECK-LABEL: sdiv_v2i32:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@ -325,7 +201,7 @@ define <2 x i32> @sdiv_v2i32(<2 x i32> %op1) #0 {
ret <2 x i32> %res ret <2 x i32> %res
} }
define <4 x i32> @sdiv_v4i32(<4 x i32> %op1) #0 { define <4 x i32> @sdiv_v4i32(<4 x i32> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v4i32: ; CHECK-LABEL: sdiv_v4i32:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@ -337,7 +213,7 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1) #0 {
ret <4 x i32> %res ret <4 x i32> %res
} }
define void @sdiv_v8i32(<8 x i32>* %a) #0 { define void @sdiv_v8i32(<8 x i32>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v8i32: ; CHECK-LABEL: sdiv_v8i32:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ptrue p0.s, vl8
@ -377,91 +253,35 @@ define void @sdiv_v16i32(<16 x i32>* %a) #0 {
ret void ret void
} }
define void @sdiv_v32i32(<32 x i32>* %a) #0 { define void @sdiv_v32i32(<32 x i32>* %a) vscale_range(8,0) #0 {
; VBITS_GE_256-LABEL: sdiv_v32i32: ; CHECK-LABEL: sdiv_v32i32:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #24 ; CHECK-NEXT: ptrue p0.s, vl32
; VBITS_GE_256-NEXT: mov x9, #8 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: mov x10, #16 ; CHECK-NEXT: asrd z0.s, p0/m, z0.s, #5
; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; CHECK-NEXT: ret
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: asrd z1.s, p0/m, z1.s, #5
; VBITS_GE_256-NEXT: asrd z0.s, p0/m, z0.s, #5
; VBITS_GE_256-NEXT: asrd z2.s, p0/m, z2.s, #5
; VBITS_GE_256-NEXT: asrd z3.s, p0/m, z3.s, #5
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: sdiv_v32i32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: asrd z0.s, p0/m, z0.s, #5
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a %op1 = load <32 x i32>, <32 x i32>* %a
%res = sdiv <32 x i32> %op1, shufflevector (<32 x i32> insertelement (<32 x i32> poison, i32 32, i32 0), <32 x i32> poison, <32 x i32> zeroinitializer) %res = sdiv <32 x i32> %op1, shufflevector (<32 x i32> insertelement (<32 x i32> poison, i32 32, i32 0), <32 x i32> poison, <32 x i32> zeroinitializer)
store <32 x i32> %res, <32 x i32>* %a store <32 x i32> %res, <32 x i32>* %a
ret void ret void
} }
define void @sdiv_v64i32(<64 x i32>* %a) #0 { define void @sdiv_v64i32(<64 x i32>* %a) vscale_range(16,0) #0 {
; VBITS_GE_256-LABEL: sdiv_v64i32: ; CHECK-LABEL: sdiv_v64i32:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #48 ; CHECK-NEXT: ptrue p0.s, vl64
; VBITS_GE_256-NEXT: mov x9, #24 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: mov x10, #8 ; CHECK-NEXT: asrd z0.s, p0/m, z0.s, #5
; VBITS_GE_256-NEXT: mov x11, #40 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_256-NEXT: mov x12, #16 ; CHECK-NEXT: ret
; VBITS_GE_256-NEXT: mov x13, #56
; VBITS_GE_256-NEXT: mov x14, #32
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x12, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x13, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x14, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: asrd z1.s, p0/m, z1.s, #5
; VBITS_GE_256-NEXT: asrd z0.s, p0/m, z0.s, #5
; VBITS_GE_256-NEXT: asrd z3.s, p0/m, z3.s, #5
; VBITS_GE_256-NEXT: asrd z2.s, p0/m, z2.s, #5
; VBITS_GE_256-NEXT: asrd z5.s, p0/m, z5.s, #5
; VBITS_GE_256-NEXT: asrd z4.s, p0/m, z4.s, #5
; VBITS_GE_256-NEXT: asrd z6.s, p0/m, z6.s, #5
; VBITS_GE_256-NEXT: asrd z7.s, p0/m, z7.s, #5
; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x13, lsl #2]
; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x0, x14, lsl #2]
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x11, lsl #2]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0, x12, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: sdiv_v64i32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: asrd z0.s, p0/m, z0.s, #5
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
%op1 = load <64 x i32>, <64 x i32>* %a %op1 = load <64 x i32>, <64 x i32>* %a
%res = sdiv <64 x i32> %op1, shufflevector (<64 x i32> insertelement (<64 x i32> poison, i32 32, i32 0), <64 x i32> poison, <64 x i32> zeroinitializer) %res = sdiv <64 x i32> %op1, shufflevector (<64 x i32> insertelement (<64 x i32> poison, i32 32, i32 0), <64 x i32> poison, <64 x i32> zeroinitializer)
store <64 x i32> %res, <64 x i32>* %a store <64 x i32> %res, <64 x i32>* %a
ret void ret void
} }
define <1 x i64> @sdiv_v1i64(<1 x i64> %op1) #0 { define <1 x i64> @sdiv_v1i64(<1 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v1i64: ; CHECK-LABEL: sdiv_v1i64:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@ -474,7 +294,7 @@ define <1 x i64> @sdiv_v1i64(<1 x i64> %op1) #0 {
} }
; Vector i64 sdiv are not legal for NEON so use SVE when available. ; Vector i64 sdiv are not legal for NEON so use SVE when available.
define <2 x i64> @sdiv_v2i64(<2 x i64> %op1) #0 { define <2 x i64> @sdiv_v2i64(<2 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v2i64: ; CHECK-LABEL: sdiv_v2i64:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@ -486,7 +306,7 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1) #0 {
ret <2 x i64> %res ret <2 x i64> %res
} }
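Base NEON has no vector integer division, so without SVE a <2 x i64> sdiv would have to be scalarised along the lines of the sketch below (hypothetical function); with SVE available the whole vector is handled by one predicated operation:

define <2 x i64> @sdiv_v2i64_scalarised(<2 x i64> %op1, <2 x i64> %op2) {
  %a0 = extractelement <2 x i64> %op1, i64 0
  %b0 = extractelement <2 x i64> %op2, i64 0
  %a1 = extractelement <2 x i64> %op1, i64 1
  %b1 = extractelement <2 x i64> %op2, i64 1
  %q0 = sdiv i64 %a0, %b0
  %q1 = sdiv i64 %a1, %b1
  %r0 = insertelement <2 x i64> undef, i64 %q0, i64 0
  %r1 = insertelement <2 x i64> %r0, i64 %q1, i64 1
  ret <2 x i64> %r1
}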
define void @sdiv_v4i64(<4 x i64>* %a) #0 { define void @sdiv_v4i64(<4 x i64>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v4i64: ; CHECK-LABEL: sdiv_v4i64:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ptrue p0.d, vl4
@ -526,84 +346,28 @@ define void @sdiv_v8i64(<8 x i64>* %a) #0 {
ret void ret void
} }
define void @sdiv_v16i64(<16 x i64>* %a) #0 { define void @sdiv_v16i64(<16 x i64>* %a) vscale_range(8,0) #0 {
; VBITS_GE_256-LABEL: sdiv_v16i64: ; CHECK-LABEL: sdiv_v16i64:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #12 ; CHECK-NEXT: ptrue p0.d, vl16
; VBITS_GE_256-NEXT: mov x9, #4 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: mov x10, #8 ; CHECK-NEXT: asrd z0.d, p0/m, z0.d, #5
; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; CHECK-NEXT: ret
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: asrd z1.d, p0/m, z1.d, #5
; VBITS_GE_256-NEXT: asrd z0.d, p0/m, z0.d, #5
; VBITS_GE_256-NEXT: asrd z2.d, p0/m, z2.d, #5
; VBITS_GE_256-NEXT: asrd z3.d, p0/m, z3.d, #5
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: sdiv_v16i64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: asrd z0.d, p0/m, z0.d, #5
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a %op1 = load <16 x i64>, <16 x i64>* %a
%res = sdiv <16 x i64> %op1, shufflevector (<16 x i64> insertelement (<16 x i64> poison, i64 32, i32 0), <16 x i64> poison, <16 x i32> zeroinitializer) %res = sdiv <16 x i64> %op1, shufflevector (<16 x i64> insertelement (<16 x i64> poison, i64 32, i32 0), <16 x i64> poison, <16 x i32> zeroinitializer)
store <16 x i64> %res, <16 x i64>* %a store <16 x i64> %res, <16 x i64>* %a
ret void ret void
} }
define void @sdiv_v32i64(<32 x i64>* %a) #0 { define void @sdiv_v32i64(<32 x i64>* %a) vscale_range(16,0) #0 {
; VBITS_GE_256-LABEL: sdiv_v32i64: ; CHECK-LABEL: sdiv_v32i64:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #24 ; CHECK-NEXT: ptrue p0.d, vl32
; VBITS_GE_256-NEXT: mov x9, #12 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: mov x10, #4 ; CHECK-NEXT: asrd z0.d, p0/m, z0.d, #5
; VBITS_GE_256-NEXT: mov x11, #20 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_256-NEXT: mov x12, #8 ; CHECK-NEXT: ret
; VBITS_GE_256-NEXT: mov x13, #28
; VBITS_GE_256-NEXT: mov x14, #16
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x11, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x12, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x13, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x14, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: asrd z1.d, p0/m, z1.d, #5
; VBITS_GE_256-NEXT: asrd z0.d, p0/m, z0.d, #5
; VBITS_GE_256-NEXT: asrd z3.d, p0/m, z3.d, #5
; VBITS_GE_256-NEXT: asrd z2.d, p0/m, z2.d, #5
; VBITS_GE_256-NEXT: asrd z5.d, p0/m, z5.d, #5
; VBITS_GE_256-NEXT: asrd z4.d, p0/m, z4.d, #5
; VBITS_GE_256-NEXT: asrd z6.d, p0/m, z6.d, #5
; VBITS_GE_256-NEXT: asrd z7.d, p0/m, z7.d, #5
; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0, x13, lsl #3]
; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x0, x14, lsl #3]
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x11, lsl #3]
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x12, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: sdiv_v32i64:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: asrd z0.d, p0/m, z0.d, #5
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a %op1 = load <32 x i64>, <32 x i64>* %a
%res = sdiv <32 x i64> %op1, shufflevector (<32 x i64> insertelement (<32 x i64> poison, i64 32, i32 0), <32 x i64> poison, <32 x i32> zeroinitializer) %res = sdiv <32 x i64> %op1, shufflevector (<32 x i64> insertelement (<32 x i64> poison, i64 32, i32 0), <32 x i64> poison, <32 x i32> zeroinitializer)
store <32 x i64> %res, <32 x i64>* %a store <32 x i64> %res, <32 x i64>* %a
View File
@ -7,7 +7,7 @@ target triple = "aarch64-unknown-linux-gnu"
; bigger than NEON. However, having no support opens us up to a code generator ; bigger than NEON. However, having no support opens us up to a code generator
; hang when expanding BUILD_VECTOR. Here we just validate the problematic case ; hang when expanding BUILD_VECTOR. Here we just validate the problematic case
; successfully exits code generation. ; successfully exits code generation.
define void @hang_when_merging_stores_after_legalisation(<8 x i32>* %a, <2 x i32> %b) #0 { define void @hang_when_merging_stores_after_legalisation(<8 x i32>* %a, <2 x i32> %b) vscale_range(2,2) #0 {
; CHECK-LABEL: hang_when_merging_stores_after_legalisation: ; CHECK-LABEL: hang_when_merging_stores_after_legalisation:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
@ -37,8 +37,8 @@ define void @hang_when_merging_stores_after_legalisation(<8 x i32>* %a, <2 x i32
ret void ret void
} }
; Ensure we don't crash when trying to lower a shuffle via and extract ; Ensure we don't crash when trying to lower a shuffle via an extract
define void @crash_when_lowering_extract_shuffle(<32 x i32>* %dst, i1 %cond) #0 { define void @crash_when_lowering_extract_shuffle(<32 x i32>* %dst, i1 %cond) vscale_range(2,2) #0 {
; CHECK-LABEL: crash_when_lowering_extract_shuffle: ; CHECK-LABEL: crash_when_lowering_extract_shuffle:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: tbnz w1, #0, .LBB1_2 ; CHECK-NEXT: tbnz w1, #0, .LBB1_2
@ -132,4 +132,4 @@ exit:
ret void ret void
} }
attributes #0 = { vscale_range(2,2) "target-features"="+sve" } attributes #0 = { "target-features"="+sve" }
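The vscale_range(2,2) requirement now sits on each function rather than in the shared attribute group; both spellings pin vscale to exactly 2 (a 256-bit machine) and are equivalent, e.g. (sketch):

define void @f(<8 x i32>* %a) vscale_range(2,2) #0 {
  ret void
}
attributes #0 = { "target-features"="+sve" }

; behaves the same as:

define void @g(<8 x i32>* %a) #1 {
  ret void
}
attributes #1 = { vscale_range(2,2) "target-features"="+sve" }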
View File
@ -1,21 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; Don't use SVE when its registers are no bigger than NEON.
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
target triple = "aarch64-unknown-linux-gnu" target triple = "aarch64-unknown-linux-gnu"
@ -24,7 +10,7 @@ target triple = "aarch64-unknown-linux-gnu"
; ;
; Don't use SVE for 64-bit vectors. ; Don't use SVE for 64-bit vectors.
define <8 x i8> @splat_v8i8(i8 %a) #0 { define <8 x i8> @splat_v8i8(i8 %a) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v8i8: ; CHECK-LABEL: splat_v8i8:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: dup v0.8b, w0 ; CHECK-NEXT: dup v0.8b, w0
@ -35,7 +21,7 @@ define <8 x i8> @splat_v8i8(i8 %a) #0 {
} }
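All of the splat tests build the splatted vector with the canonical insertelement plus shufflevector-with-zero-mask idiom, which is what lowers to the dup/mov seen in the CHECK lines. A minimal sketch (hypothetical function name):

define <8 x i8> @splat_example(i8 %a) {
  %insert = insertelement <8 x i8> undef, i8 %a, i64 0
  %splat = shufflevector <8 x i8> %insert, <8 x i8> undef, <8 x i32> zeroinitializer
  ret <8 x i8> %splat
}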
; Don't use SVE for 128-bit vectors. ; Don't use SVE for 128-bit vectors.
define <16 x i8> @splat_v16i8(i8 %a) #0 { define <16 x i8> @splat_v16i8(i8 %a) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v16i8: ; CHECK-LABEL: splat_v16i8:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: dup v0.16b, w0 ; CHECK-NEXT: dup v0.16b, w0
@ -45,7 +31,7 @@ define <16 x i8> @splat_v16i8(i8 %a) #0 {
ret <16 x i8> %splat ret <16 x i8> %splat
} }
define void @splat_v32i8(i8 %a, <32 x i8>* %b) #0 { define void @splat_v32i8(i8 %a, <32 x i8>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v32i8: ; CHECK-LABEL: splat_v32i8:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl32 ; CHECK-NEXT: ptrue p0.b, vl32
@ -74,68 +60,32 @@ define void @splat_v64i8(i8 %a, <64 x i8>* %b) #0 {
; VBITS_GE_512-NEXT: mov z0.b, w0 ; VBITS_GE_512-NEXT: mov z0.b, w0
; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x1] ; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x1]
; VBITS_GE_512-NEXT: ret ; VBITS_GE_512-NEXT: ret
; Ensure sensible type legalisation.
%insert = insertelement <64 x i8> undef, i8 %a, i64 0 %insert = insertelement <64 x i8> undef, i8 %a, i64 0
%splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
store <64 x i8> %splat, <64 x i8>* %b store <64 x i8> %splat, <64 x i8>* %b
ret void ret void
} }
define void @splat_v128i8(i8 %a, <128 x i8>* %b) #0 { define void @splat_v128i8(i8 %a, <128 x i8>* %b) vscale_range(8,0) #0 {
; VBITS_GE_256-LABEL: splat_v128i8: ; CHECK-LABEL: splat_v128i8:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov w8, #96 ; CHECK-NEXT: ptrue p0.b, vl128
; VBITS_GE_256-NEXT: mov w9, #64 ; CHECK-NEXT: mov z0.b, w0
; VBITS_GE_256-NEXT: mov w10, #32 ; CHECK-NEXT: st1b { z0.b }, p0, [x1]
; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; CHECK-NEXT: ret
; VBITS_GE_256-NEXT: mov z0.b, w0
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x8]
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x9]
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x10]
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: splat_v128i8:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.b, vl128
; VBITS_GE_1024-NEXT: mov z0.b, w0
; VBITS_GE_1024-NEXT: st1b { z0.b }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
%insert = insertelement <128 x i8> undef, i8 %a, i64 0 %insert = insertelement <128 x i8> undef, i8 %a, i64 0
%splat = shufflevector <128 x i8> %insert, <128 x i8> undef, <128 x i32> zeroinitializer %splat = shufflevector <128 x i8> %insert, <128 x i8> undef, <128 x i32> zeroinitializer
store <128 x i8> %splat, <128 x i8>* %b store <128 x i8> %splat, <128 x i8>* %b
ret void ret void
} }
define void @splat_v256i8(i8 %a, <256 x i8>* %b) #0 { define void @splat_v256i8(i8 %a, <256 x i8>* %b) vscale_range(16,0) #0 {
; VBITS_GE_256-LABEL: splat_v256i8: ; CHECK-LABEL: splat_v256i8:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov w8, #224 ; CHECK-NEXT: ptrue p0.b, vl256
; VBITS_GE_256-NEXT: mov w9, #192 ; CHECK-NEXT: mov z0.b, w0
; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; CHECK-NEXT: st1b { z0.b }, p0, [x1]
; VBITS_GE_256-NEXT: mov z0.b, w0 ; CHECK-NEXT: ret
; VBITS_GE_256-NEXT: mov w10, #160
; VBITS_GE_256-NEXT: mov w11, #128
; VBITS_GE_256-NEXT: mov w12, #96
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x8]
; VBITS_GE_256-NEXT: mov w8, #64
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x9]
; VBITS_GE_256-NEXT: mov w9, #32
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x10]
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x11]
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x12]
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x8]
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1]
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x9]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: splat_v256i8:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.b, vl256
; VBITS_GE_2048-NEXT: mov z0.b, w0
; VBITS_GE_2048-NEXT: st1b { z0.b }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
%insert = insertelement <256 x i8> undef, i8 %a, i64 0 %insert = insertelement <256 x i8> undef, i8 %a, i64 0
%splat = shufflevector <256 x i8> %insert, <256 x i8> undef, <256 x i32> zeroinitializer %splat = shufflevector <256 x i8> %insert, <256 x i8> undef, <256 x i32> zeroinitializer
store <256 x i8> %splat, <256 x i8>* %b store <256 x i8> %splat, <256 x i8>* %b
@ -143,7 +93,7 @@ define void @splat_v256i8(i8 %a, <256 x i8>* %b) #0 {
} }
; Don't use SVE for 64-bit vectors. ; Don't use SVE for 64-bit vectors.
define <4 x i16> @splat_v4i16(i16 %a) #0 { define <4 x i16> @splat_v4i16(i16 %a) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v4i16: ; CHECK-LABEL: splat_v4i16:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: dup v0.4h, w0 ; CHECK-NEXT: dup v0.4h, w0
@ -154,7 +104,7 @@ define <4 x i16> @splat_v4i16(i16 %a) #0 {
} }
; Don't use SVE for 128-bit vectors. ; Don't use SVE for 128-bit vectors.
define <8 x i16> @splat_v8i16(i16 %a) #0 { define <8 x i16> @splat_v8i16(i16 %a) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v8i16: ; CHECK-LABEL: splat_v8i16:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: dup v0.8h, w0 ; CHECK-NEXT: dup v0.8h, w0
@ -164,7 +114,7 @@ define <8 x i16> @splat_v8i16(i16 %a) #0 {
ret <8 x i16> %splat ret <8 x i16> %splat
} }
define void @splat_v16i16(i16 %a, <16 x i16>* %b) #0 { define void @splat_v16i16(i16 %a, <16 x i16>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v16i16: ; CHECK-LABEL: splat_v16i16:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: ptrue p0.h, vl16
@ -193,68 +143,32 @@ define void @splat_v32i16(i16 %a, <32 x i16>* %b) #0 {
; VBITS_GE_512-NEXT: mov z0.h, w0 ; VBITS_GE_512-NEXT: mov z0.h, w0
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1] ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_512-NEXT: ret ; VBITS_GE_512-NEXT: ret
; Ensure sensible type legalisation.
%insert = insertelement <32 x i16> undef, i16 %a, i64 0 %insert = insertelement <32 x i16> undef, i16 %a, i64 0
%splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
store <32 x i16> %splat, <32 x i16>* %b store <32 x i16> %splat, <32 x i16>* %b
ret void ret void
} }
define void @splat_v64i16(i16 %a, <64 x i16>* %b) #0 { define void @splat_v64i16(i16 %a, <64 x i16>* %b) vscale_range(8,0) #0 {
; VBITS_GE_256-LABEL: splat_v64i16: ; CHECK-LABEL: splat_v64i16:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #48 ; CHECK-NEXT: ptrue p0.h, vl64
; VBITS_GE_256-NEXT: mov x9, #32 ; CHECK-NEXT: mov z0.h, w0
; VBITS_GE_256-NEXT: mov x10, #16 ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: ret
; VBITS_GE_256-NEXT: mov z0.h, w0
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x9, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x10, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: splat_v64i16:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
; VBITS_GE_1024-NEXT: mov z0.h, w0
; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
%insert = insertelement <64 x i16> undef, i16 %a, i64 0 %insert = insertelement <64 x i16> undef, i16 %a, i64 0
%splat = shufflevector <64 x i16> %insert, <64 x i16> undef, <64 x i32> zeroinitializer %splat = shufflevector <64 x i16> %insert, <64 x i16> undef, <64 x i32> zeroinitializer
store <64 x i16> %splat, <64 x i16>* %b store <64 x i16> %splat, <64 x i16>* %b
ret void ret void
} }
define void @splat_v128i16(i16 %a, <128 x i16>* %b) #0 { define void @splat_v128i16(i16 %a, <128 x i16>* %b) vscale_range(16,0) #0 {
; VBITS_GE_256-LABEL: splat_v128i16: ; CHECK-LABEL: splat_v128i16:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #112 ; CHECK-NEXT: ptrue p0.h, vl128
; VBITS_GE_256-NEXT: mov x9, #96 ; CHECK-NEXT: mov z0.h, w0
; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_256-NEXT: mov z0.h, w0 ; CHECK-NEXT: ret
; VBITS_GE_256-NEXT: mov x10, #80
; VBITS_GE_256-NEXT: mov x11, #64
; VBITS_GE_256-NEXT: mov x12, #48
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: mov x8, #32
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x9, lsl #1]
; VBITS_GE_256-NEXT: mov x9, #16
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x10, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x11, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x12, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x9, lsl #1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: splat_v128i16:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
; VBITS_GE_2048-NEXT: mov z0.h, w0
; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
%insert = insertelement <128 x i16> undef, i16 %a, i64 0 %insert = insertelement <128 x i16> undef, i16 %a, i64 0
%splat = shufflevector <128 x i16> %insert, <128 x i16> undef, <128 x i32> zeroinitializer %splat = shufflevector <128 x i16> %insert, <128 x i16> undef, <128 x i32> zeroinitializer
store <128 x i16> %splat, <128 x i16>* %b store <128 x i16> %splat, <128 x i16>* %b
@ -262,7 +176,7 @@ define void @splat_v128i16(i16 %a, <128 x i16>* %b) #0 {
} }
; Don't use SVE for 64-bit vectors. ; Don't use SVE for 64-bit vectors.
define <2 x i32> @splat_v2i32(i32 %a) #0 { define <2 x i32> @splat_v2i32(i32 %a) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v2i32: ; CHECK-LABEL: splat_v2i32:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: dup v0.2s, w0 ; CHECK-NEXT: dup v0.2s, w0
@ -273,7 +187,7 @@ define <2 x i32> @splat_v2i32(i32 %a) #0 {
} }
; Don't use SVE for 128-bit vectors. ; Don't use SVE for 128-bit vectors.
define <4 x i32> @splat_v4i32(i32 %a) #0 { define <4 x i32> @splat_v4i32(i32 %a) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v4i32: ; CHECK-LABEL: splat_v4i32:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: dup v0.4s, w0 ; CHECK-NEXT: dup v0.4s, w0
@ -283,7 +197,7 @@ define <4 x i32> @splat_v4i32(i32 %a) #0 {
ret <4 x i32> %splat ret <4 x i32> %splat
} }
define void @splat_v8i32(i32 %a, <8 x i32>* %b) #0 { define void @splat_v8i32(i32 %a, <8 x i32>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v8i32: ; CHECK-LABEL: splat_v8i32:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ptrue p0.s, vl8
@ -312,68 +226,32 @@ define void @splat_v16i32(i32 %a, <16 x i32>* %b) #0 {
; VBITS_GE_512-NEXT: mov z0.s, w0 ; VBITS_GE_512-NEXT: mov z0.s, w0
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1] ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT: ret ; VBITS_GE_512-NEXT: ret
; Ensure sensible type legalisation.
%insert = insertelement <16 x i32> undef, i32 %a, i64 0 %insert = insertelement <16 x i32> undef, i32 %a, i64 0
%splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
store <16 x i32> %splat, <16 x i32>* %b store <16 x i32> %splat, <16 x i32>* %b
ret void ret void
} }
define void @splat_v32i32(i32 %a, <32 x i32>* %b) #0 { define void @splat_v32i32(i32 %a, <32 x i32>* %b) vscale_range(8,0) #0 {
; VBITS_GE_256-LABEL: splat_v32i32: ; CHECK-LABEL: splat_v32i32:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #24 ; CHECK-NEXT: ptrue p0.s, vl32
; VBITS_GE_256-NEXT: mov x9, #16 ; CHECK-NEXT: mov z0.s, w0
; VBITS_GE_256-NEXT: mov x10, #8 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ret
; VBITS_GE_256-NEXT: mov z0.s, w0
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: splat_v32i32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: mov z0.s, w0
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
%insert = insertelement <32 x i32> undef, i32 %a, i64 0 %insert = insertelement <32 x i32> undef, i32 %a, i64 0
%splat = shufflevector <32 x i32> %insert, <32 x i32> undef, <32 x i32> zeroinitializer %splat = shufflevector <32 x i32> %insert, <32 x i32> undef, <32 x i32> zeroinitializer
store <32 x i32> %splat, <32 x i32>* %b store <32 x i32> %splat, <32 x i32>* %b
ret void ret void
} }
define void @splat_v64i32(i32 %a, <64 x i32>* %b) #0 { define void @splat_v64i32(i32 %a, <64 x i32>* %b) vscale_range(16,0) #0 {
; VBITS_GE_256-LABEL: splat_v64i32: ; CHECK-LABEL: splat_v64i32:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #56 ; CHECK-NEXT: ptrue p0.s, vl64
; VBITS_GE_256-NEXT: mov x9, #48 ; CHECK-NEXT: mov z0.s, w0
; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_256-NEXT: mov z0.s, w0 ; CHECK-NEXT: ret
; VBITS_GE_256-NEXT: mov x10, #40
; VBITS_GE_256-NEXT: mov x11, #32
; VBITS_GE_256-NEXT: mov x12, #24
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: mov x9, #8
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x11, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x12, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: splat_v64i32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: mov z0.s, w0
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
%insert = insertelement <64 x i32> undef, i32 %a, i64 0 %insert = insertelement <64 x i32> undef, i32 %a, i64 0
%splat = shufflevector <64 x i32> %insert, <64 x i32> undef, <64 x i32> zeroinitializer %splat = shufflevector <64 x i32> %insert, <64 x i32> undef, <64 x i32> zeroinitializer
store <64 x i32> %splat, <64 x i32>* %b store <64 x i32> %splat, <64 x i32>* %b
@ -381,7 +259,7 @@ define void @splat_v64i32(i32 %a, <64 x i32>* %b) #0 {
} }
; Don't use SVE for 64-bit vectors. ; Don't use SVE for 64-bit vectors.
define <1 x i64> @splat_v1i64(i64 %a) #0 { define <1 x i64> @splat_v1i64(i64 %a) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v1i64: ; CHECK-LABEL: splat_v1i64:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: fmov d0, x0 ; CHECK-NEXT: fmov d0, x0
@ -392,7 +270,7 @@ define <1 x i64> @splat_v1i64(i64 %a) #0 {
} }
; Don't use SVE for 128-bit vectors. ; Don't use SVE for 128-bit vectors.
define <2 x i64> @splat_v2i64(i64 %a) #0 { define <2 x i64> @splat_v2i64(i64 %a) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v2i64: ; CHECK-LABEL: splat_v2i64:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: dup v0.2d, x0 ; CHECK-NEXT: dup v0.2d, x0
@ -402,7 +280,7 @@ define <2 x i64> @splat_v2i64(i64 %a) #0 {
ret <2 x i64> %splat ret <2 x i64> %splat
} }
define void @splat_v4i64(i64 %a, <4 x i64>* %b) #0 { define void @splat_v4i64(i64 %a, <4 x i64>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v4i64: ; CHECK-LABEL: splat_v4i64:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ptrue p0.d, vl4
@ -431,68 +309,32 @@ define void @splat_v8i64(i64 %a, <8 x i64>* %b) #0 {
; VBITS_GE_512-NEXT: mov z0.d, x0 ; VBITS_GE_512-NEXT: mov z0.d, x0
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1] ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT: ret ; VBITS_GE_512-NEXT: ret
; Ensure sensible type legalisation.
%insert = insertelement <8 x i64> undef, i64 %a, i64 0 %insert = insertelement <8 x i64> undef, i64 %a, i64 0
%splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
store <8 x i64> %splat, <8 x i64>* %b store <8 x i64> %splat, <8 x i64>* %b
ret void ret void
} }
define void @splat_v16i64(i64 %a, <16 x i64>* %b) #0 { define void @splat_v16i64(i64 %a, <16 x i64>* %b) vscale_range(8,0) #0 {
; VBITS_GE_256-LABEL: splat_v16i64: ; CHECK-LABEL: splat_v16i64:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #12 ; CHECK-NEXT: ptrue p0.d, vl16
; VBITS_GE_256-NEXT: mov x9, #8 ; CHECK-NEXT: mov z0.d, x0
; VBITS_GE_256-NEXT: mov x10, #4 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ret
; VBITS_GE_256-NEXT: mov z0.d, x0
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: splat_v16i64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: mov z0.d, x0
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
%insert = insertelement <16 x i64> undef, i64 %a, i64 0 %insert = insertelement <16 x i64> undef, i64 %a, i64 0
%splat = shufflevector <16 x i64> %insert, <16 x i64> undef, <16 x i32> zeroinitializer %splat = shufflevector <16 x i64> %insert, <16 x i64> undef, <16 x i32> zeroinitializer
store <16 x i64> %splat, <16 x i64>* %b store <16 x i64> %splat, <16 x i64>* %b
ret void ret void
} }
define void @splat_v32i64(i64 %a, <32 x i64>* %b) #0 { define void @splat_v32i64(i64 %a, <32 x i64>* %b) vscale_range(16,0) #0 {
; VBITS_GE_256-LABEL: splat_v32i64: ; CHECK-LABEL: splat_v32i64:
; VBITS_GE_256: // %bb.0: ; CHECK: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #28 ; CHECK-NEXT: ptrue p0.d, vl32
; VBITS_GE_256-NEXT: mov x9, #24 ; CHECK-NEXT: mov z0.d, x0
; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_256-NEXT: mov z0.d, x0 ; CHECK-NEXT: ret
; VBITS_GE_256-NEXT: mov x10, #20
; VBITS_GE_256-NEXT: mov x11, #16
; VBITS_GE_256-NEXT: mov x12, #12
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: mov x9, #4
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x11, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x12, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: splat_v32i64:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: mov z0.d, x0
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
%insert = insertelement <32 x i64> undef, i64 %a, i64 0 %insert = insertelement <32 x i64> undef, i64 %a, i64 0
%splat = shufflevector <32 x i64> %insert, <32 x i64> undef, <32 x i32> zeroinitializer %splat = shufflevector <32 x i64> %insert, <32 x i64> undef, <32 x i32> zeroinitializer
store <32 x i64> %splat, <32 x i64>* %b store <32 x i64> %splat, <32 x i64>* %b
@ -504,7 +346,7 @@ define void @splat_v32i64(i64 %a, <32 x i64>* %b) #0 {
; ;
; Don't use SVE for 64-bit vectors. ; Don't use SVE for 64-bit vectors.
define <4 x half> @splat_v4f16(half %a) #0 { define <4 x half> @splat_v4f16(half %a) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v4f16: ; CHECK-LABEL: splat_v4f16:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0 ; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0
@ -516,7 +358,7 @@ define <4 x half> @splat_v4f16(half %a) #0 {
} }
; Don't use SVE for 128-bit vectors. ; Don't use SVE for 128-bit vectors.
define <8 x half> @splat_v8f16(half %a) #0 { define <8 x half> @splat_v8f16(half %a) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v8f16: ; CHECK-LABEL: splat_v8f16:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0 ; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0
@ -527,7 +369,7 @@ define <8 x half> @splat_v8f16(half %a) #0 {
ret <8 x half> %splat ret <8 x half> %splat
} }
define void @splat_v16f16(half %a, <16 x half>* %b) #0 { define void @splat_v16f16(half %a, <16 x half>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v16f16: ; CHECK-LABEL: splat_v16f16:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
@ -559,72 +401,34 @@ define void @splat_v32f16(half %a, <32 x half>* %b) #0 {
; VBITS_GE_512-NEXT: mov z0.h, h0 ; VBITS_GE_512-NEXT: mov z0.h, h0
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT: ret ; VBITS_GE_512-NEXT: ret
; Ensure sensible type legalisation.
%insert = insertelement <32 x half> undef, half %a, i64 0 %insert = insertelement <32 x half> undef, half %a, i64 0
%splat = shufflevector <32 x half> %insert, <32 x half> undef, <32 x i32> zeroinitializer %splat = shufflevector <32 x half> %insert, <32 x half> undef, <32 x i32> zeroinitializer
store <32 x half> %splat, <32 x half>* %b store <32 x half> %splat, <32 x half>* %b
ret void ret void
} }
define void @splat_v64f16(half %a, <64 x half>* %b) #0 {
; VBITS_GE_256-LABEL: splat_v64f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #48
; VBITS_GE_256-NEXT: mov x9, #32
; VBITS_GE_256-NEXT: mov x10, #16
; VBITS_GE_256-NEXT: // kill: def $h0 killed $h0 def $z0
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: mov z0.h, h0
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: splat_v64f16:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: // kill: def $h0 killed $h0 def $z0
; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
; VBITS_GE_1024-NEXT: mov z0.h, h0
; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
define void @splat_v64f16(half %a, <64 x half>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: splat_v64f16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: mov z0.h, h0
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%insert = insertelement <64 x half> undef, half %a, i64 0
%splat = shufflevector <64 x half> %insert, <64 x half> undef, <64 x i32> zeroinitializer
store <64 x half> %splat, <64 x half>* %b
ret void
}
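The single CHECK block above is possible because vscale_range pins down the minimum register width, so update_llc_test_checks no longer needs per-width prefixes. A rough sketch of the arithmetic, as annotation only (the comments below are illustrative, not part of the test):
; vscale_range(8,0) guarantees vscale >= 8, i.e. SVE registers of at least 8 x 128 = 1024 bits.
; <64 x half> is 64 x 16 = 1024 bits, so the whole splat fits one register:
;   ptrue p0.h, vl64          ; predicate with exactly 64 active halfword lanes
;   st1h { z0.h }, p0, [x0]   ; one predicated store replaces the four VBITS_GE_256 stores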
define void @splat_v128f16(half %a, <128 x half>* %b) #0 {
; VBITS_GE_256-LABEL: splat_v128f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #112
; VBITS_GE_256-NEXT: mov x9, #96
; VBITS_GE_256-NEXT: // kill: def $h0 killed $h0 def $z0
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: mov x10, #80
; VBITS_GE_256-NEXT: mov z0.h, h0
; VBITS_GE_256-NEXT: mov x11, #64
; VBITS_GE_256-NEXT: mov x12, #48
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: mov x8, #32
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: mov x9, #16
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x11, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x12, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: splat_v128f16:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: // kill: def $h0 killed $h0 def $z0
; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
; VBITS_GE_2048-NEXT: mov z0.h, h0
; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
define void @splat_v128f16(half %a, <128 x half>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: splat_v128f16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: mov z0.h, h0
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%insert = insertelement <128 x half> undef, half %a, i64 0
%splat = shufflevector <128 x half> %insert, <128 x half> undef, <128 x i32> zeroinitializer
store <128 x half> %splat, <128 x half>* %b
@@ -632,7 +436,7 @@ define void @splat_v128f16(half %a, <128 x half>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
define <2 x float> @splat_v2f32(float %a, <2 x float> %op2) #0 {
define <2 x float> @splat_v2f32(float %a, <2 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
@@ -644,7 +448,7 @@ define <2 x float> @splat_v2f32(float %a, <2 x float> %op2) #0 {
}
; Don't use SVE for 128-bit vectors.
define <4 x float> @splat_v4f32(float %a, <4 x float> %op2) #0 {
define <4 x float> @splat_v4f32(float %a, <4 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
@@ -655,7 +459,7 @@ define <4 x float> @splat_v4f32(float %a, <4 x float> %op2) #0 {
ret <4 x float> %splat
}
define void @splat_v8f32(float %a, <8 x float>* %b) #0 {
define void @splat_v8f32(float %a, <8 x float>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
@@ -687,7 +491,7 @@ define void @splat_v16f32(float %a, <16 x float>* %b) #0 {
; VBITS_GE_512-NEXT: mov z0.s, s0
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
; Ensure sensible type legalisation.
%insert = insertelement <16 x float> undef, float %a, i64 0
%splat = shufflevector <16 x float> %insert, <16 x float> undef, <16 x i32> zeroinitializer
store <16 x float> %splat, <16 x float>* %b
ret void
}
define void @splat_v32f32(float %a, <32 x float>* %b) #0 {
; VBITS_GE_256-LABEL: splat_v32f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #24
; VBITS_GE_256-NEXT: mov x9, #16
; VBITS_GE_256-NEXT: mov x10, #8
; VBITS_GE_256-NEXT: // kill: def $s0 killed $s0 def $z0
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov z0.s, s0
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: splat_v32f32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: // kill: def $s0 killed $s0 def $z0
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: mov z0.s, s0
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
define void @splat_v32f32(float %a, <32 x float>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: splat_v32f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: mov z0.s, s0
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%insert = insertelement <32 x float> undef, float %a, i64 0
%splat = shufflevector <32 x float> %insert, <32 x float> undef, <32 x i32> zeroinitializer
store <32 x float> %splat, <32 x float>* %b
ret void
}
define void @splat_v64f32(float %a, <64 x float>* %b) #0 {
; VBITS_GE_256-LABEL: splat_v64f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #56
; VBITS_GE_256-NEXT: mov x9, #48
; VBITS_GE_256-NEXT: // kill: def $s0 killed $s0 def $z0
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x10, #40
; VBITS_GE_256-NEXT: mov z0.s, s0
; VBITS_GE_256-NEXT: mov x11, #32
; VBITS_GE_256-NEXT: mov x12, #24
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: mov x9, #8
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x11, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x12, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: splat_v64f32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: // kill: def $s0 killed $s0 def $z0
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: mov z0.s, s0
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
define void @splat_v64f32(float %a, <64 x float>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: splat_v64f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: mov z0.s, s0
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%insert = insertelement <64 x float> undef, float %a, i64 0
%splat = shufflevector <64 x float> %insert, <64 x float> undef, <64 x i32> zeroinitializer
store <64 x float> %splat, <64 x float>* %b
@@ -760,7 +526,7 @@ define void @splat_v64f32(float %a, <64 x float>* %b) #0 {
}
; Don't use SVE for 64-bit vectors.
define <1 x double> @splat_v1f64(double %a, <1 x double> %op2) #0 {
define <1 x double> @splat_v1f64(double %a, <1 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ret
@@ -770,7 +536,7 @@ define <1 x double> @splat_v1f64(double %a, <1 x double> %op2) #0 {
}
; Don't use SVE for 128-bit vectors.
define <2 x double> @splat_v2f64(double %a, <2 x double> %op2) #0 {
define <2 x double> @splat_v2f64(double %a, <2 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
@@ -781,7 +547,7 @@ define <2 x double> @splat_v2f64(double %a, <2 x double> %op2) #0 {
ret <2 x double> %splat
}
define void @splat_v4f64(double %a, <4 x double>* %b) #0 {
define void @splat_v4f64(double %a, <4 x double>* %b) vscale_range(2,0) #0 {
; CHECK-LABEL: splat_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@@ -813,7 +579,7 @@ define void @splat_v8f64(double %a, <8 x double>* %b) #0 {
; VBITS_GE_512-NEXT: mov z0.d, d0
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
; Ensure sensible type legalisation.
%insert = insertelement <8 x double> undef, double %a, i64 0
%splat = shufflevector <8 x double> %insert, <8 x double> undef, <8 x i32> zeroinitializer
store <8 x double> %splat, <8 x double>* %b
ret void
}
define void @splat_v16f64(double %a, <16 x double>* %b) #0 {
; VBITS_GE_256-LABEL: splat_v16f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #12
; VBITS_GE_256-NEXT: mov x9, #8
; VBITS_GE_256-NEXT: mov x10, #4
; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov z0.d, d0
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: splat_v16f64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: // kill: def $d0 killed $d0 def $z0
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: mov z0.d, d0
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
define void @splat_v16f64(double %a, <16 x double>* %b) vscale_range(8,0) #0 {
; CHECK-LABEL: splat_v16f64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: mov z0.d, d0
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%insert = insertelement <16 x double> undef, double %a, i64 0
%splat = shufflevector <16 x double> %insert, <16 x double> undef, <16 x i32> zeroinitializer
store <16 x double> %splat, <16 x double>* %b
ret void
}
define void @splat_v32f64(double %a, <32 x double>* %b) #0 {
; VBITS_GE_256-LABEL: splat_v32f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #28
; VBITS_GE_256-NEXT: mov x9, #24
; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x10, #20
; VBITS_GE_256-NEXT: mov z0.d, d0
; VBITS_GE_256-NEXT: mov x11, #16
; VBITS_GE_256-NEXT: mov x12, #12
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: mov x9, #4
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x11, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x12, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: splat_v32f64:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: // kill: def $d0 killed $d0 def $z0
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: mov z0.d, d0
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
define void @splat_v32f64(double %a, <32 x double>* %b) vscale_range(16,0) #0 {
; CHECK-LABEL: splat_v32f64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: mov z0.d, d0
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%insert = insertelement <32 x double> undef, double %a, i64 0
%splat = shufflevector <32 x double> %insert, <32 x double> undef, <32 x i32> zeroinitializer
store <32 x double> %splat, <32 x double>* %b
@@ -889,88 +617,52 @@ define void @splat_v32f64(double %a, <32 x double>* %b) #0 {
; DUP (integer immediate)
;
define void @splat_imm_v64i8(<64 x i8>* %a) #0 {
; VBITS_GE_256-LABEL: splat_imm_v64i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov w8, #32
; VBITS_GE_256-NEXT: mov z0.b, #1 // =0x1
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0]
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: splat_imm_v64i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: mov z0.b, #1 // =0x1
; VBITS_GE_512-NEXT: ptrue p0.b, vl64
; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT: ret
define void @splat_imm_v64i8(<64 x i8>* %a) vscale_range(4,0) #0 {
; CHECK-LABEL: splat_imm_v64i8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.b, #1 // =0x1
; CHECK-NEXT: ptrue p0.b, vl64
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%insert = insertelement <64 x i8> undef, i8 1, i64 0
%splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
store <64 x i8> %splat, <64 x i8>* %a
ret void
}
define void @splat_imm_v32i16(<32 x i16>* %a) #0 {
; VBITS_GE_256-LABEL: splat_imm_v32i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: mov z0.h, #2 // =0x2
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: splat_imm_v32i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: mov z0.h, #2 // =0x2
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT: ret
define void @splat_imm_v32i16(<32 x i16>* %a) vscale_range(4,0) #0 {
; CHECK-LABEL: splat_imm_v32i16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.h, #2 // =0x2
; CHECK-NEXT: ptrue p0.h, vl32
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%insert = insertelement <32 x i16> undef, i16 2, i64 0
%splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
store <32 x i16> %splat, <32 x i16>* %a
ret void
}
define void @splat_imm_v16i32(<16 x i32>* %a) #0 {
; VBITS_GE_256-LABEL: splat_imm_v16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: mov z0.s, #3 // =0x3
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: splat_imm_v16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: mov z0.s, #3 // =0x3
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
define void @splat_imm_v16i32(<16 x i32>* %a) vscale_range(4,0) #0 {
; CHECK-LABEL: splat_imm_v16i32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.s, #3 // =0x3
; CHECK-NEXT: ptrue p0.s, vl16
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%insert = insertelement <16 x i32> undef, i32 3, i64 0
%splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
store <16 x i32> %splat, <16 x i32>* %a
ret void
}
define void @splat_imm_v8i64(<8 x i64>* %a) #0 {
; VBITS_GE_256-LABEL: splat_imm_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: mov z0.d, #4 // =0x4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: splat_imm_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: mov z0.d, #4 // =0x4
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
define void @splat_imm_v8i64(<8 x i64>* %a) vscale_range(4,0) #0 {
; CHECK-LABEL: splat_imm_v8i64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.d, #4 // =0x4
; CHECK-NEXT: ptrue p0.d, vl8
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%insert = insertelement <8 x i64> undef, i64 4, i64 0
%splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
store <8 x i64> %splat, <8 x i64>* %a
@@ -981,69 +673,43 @@ define void @splat_imm_v8i64(<8 x i64>* %a) #0 {
; DUP (floating-point immediate)
;
define void @splat_imm_v32f16(<32 x half>* %a) #0 {
; VBITS_GE_256-LABEL: splat_imm_v32f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16
; VBITS_GE_256-NEXT: fmov z0.h, #5.00000000
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: splat_imm_v32f16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: fmov z0.h, #5.00000000
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT: ret
define void @splat_imm_v32f16(<32 x half>* %a) vscale_range(4,0) #0 {
; CHECK-LABEL: splat_imm_v32f16:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov z0.h, #5.00000000
; CHECK-NEXT: ptrue p0.h, vl32
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%insert = insertelement <32 x half> undef, half 5.0, i64 0
%splat = shufflevector <32 x half> %insert, <32 x half> undef, <32 x i32> zeroinitializer
store <32 x half> %splat, <32 x half>* %a
ret void
}
define void @splat_imm_v16f32(<16 x float>* %a) #0 {
; VBITS_GE_256-LABEL: splat_imm_v16f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: fmov z0.s, #6.00000000
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: splat_imm_v16f32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: fmov z0.s, #6.00000000
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
define void @splat_imm_v16f32(<16 x float>* %a) vscale_range(4,0) #0 {
; CHECK-LABEL: splat_imm_v16f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov z0.s, #6.00000000
; CHECK-NEXT: ptrue p0.s, vl16
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%insert = insertelement <16 x float> undef, float 6.0, i64 0
%splat = shufflevector <16 x float> %insert, <16 x float> undef, <16 x i32> zeroinitializer
store <16 x float> %splat, <16 x float>* %a
ret void
}
define void @splat_imm_v8f64(<8 x double>* %a) #0 {
; VBITS_GE_256-LABEL: splat_imm_v8f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: fmov z0.d, #7.00000000
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: splat_imm_v8f64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: fmov z0.d, #7.00000000
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
define void @splat_imm_v8f64(<8 x double>* %a) vscale_range(4,0) #0 {
; CHECK-LABEL: splat_imm_v8f64:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov z0.d, #7.00000000
; CHECK-NEXT: ptrue p0.d, vl8
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%insert = insertelement <8 x double> undef, double 7.0, i64 0
%splat = shufflevector <8 x double> %insert, <8 x double> undef, <8 x i32> zeroinitializer
store <8 x double> %splat, <8 x double>* %a
ret void
}
attributes #0 = { "target-features"="+sve" }
@@ -1,35 +1,29 @@
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512,VBITS_LE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_LE_1024,VBITS_LE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_LE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_2048
; VBYTES represents the useful byte size of a vector register from the code
; generator's point of view. It is clamped to power-of-2 values because
; only power-of-2 vector lengths are considered legal, regardless of the
; user specified vector length.
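The removed RUN lines above relied on FileCheck numeric substitution: -D#VBYTES=N seeds a numeric variable and [[#...]] expressions are evaluated per RUN line. A worked example of the expansion, using the store_v8f32 check below (values shown for -D#VBYTES=32; the expansion notes are illustrative, not part of the test):
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
; With VBYTES=32: div(32,4) = 8 and min(8,8) = 8, so the line matches "ptrue p0.s, vl8".
; With VBYTES=64: div(64,4) = 16 but min(16,8) = 8, clamping the expected vl to the vector's element count.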
target triple = "aarch64-unknown-linux-gnu"
; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: ptrue
; Don't use SVE for 64-bit vectors.
define void @store_v2f32(<2 x float>* %a) #0 {
; CHECK-LABEL: store_v2f32:
; CHECK: str xzr, [x0]
; CHECK: ret
; CHECK: // %bb.0:
; CHECK-NEXT: str xzr, [x0]
; CHECK-NEXT: ret
store <2 x float> zeroinitializer, <2 x float>* %a
ret void
}
@@ -37,66 +31,148 @@ define void @store_v2f32(<2 x float>* %a) #0 {
; Don't use SVE for 128-bit vectors.
define void @store_v4f32(<4 x float>* %a) #0 {
; CHECK-LABEL: store_v4f32:
; CHECK: stp xzr, xzr, [x0]
; CHECK: ret
; CHECK: // %bb.0:
; CHECK-NEXT: stp xzr, xzr, [x0]
; CHECK-NEXT: ret
store <4 x float> zeroinitializer, <4 x float>* %a
ret void
}
define void @store_v8f32(<8 x float>* %a) #0 {
; CHECK-LABEL: store_v8f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
; CHECK: st1w { z{{[0-9]+}}.s }, [[PG]], [x0]
; CHECK: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: mov z0.s, #0 // =0x0
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
store <8 x float> zeroinitializer, <8 x float>* %a
ret void
}
define void @store_v16f32(<16 x float>* %a) #0 {
; CHECK-LABEL: store_v16f32:
; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
; CHECK-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0]
; VBITS_LE_256-DAG: mov x[[A1:[0-9]+]], #[[#div(VBYTES,4)]]
; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A1]], lsl #2]
; CHECK: ret
; VBITS_GE_256-LABEL: store_v16f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov z0.s, #0 // =0x0
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: store_v16f32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: mov z0.s, #0 // =0x0
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
;
; VBITS_GE_1024-LABEL: store_v16f32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
; VBITS_GE_1024-NEXT: mov z0.s, #0 // =0x0
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: store_v16f32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl16
; VBITS_GE_2048-NEXT: mov z0.s, #0 // =0x0
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
store <16 x float> zeroinitializer, <16 x float>* %a
ret void
}
define void @store_v32f32(<32 x float>* %a) #0 {
; CHECK-LABEL: store_v32f32:
; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
; CHECK-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0]
; VBITS_LE_512-DAG: mov x[[A1:[0-9]+]], #[[#div(VBYTES,4)]]
; VBITS_LE_512-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A1]], lsl #2]
; VBITS_LE_256-DAG: mov x[[A2:[0-9]+]], #[[#mul(div(VBYTES,4),2)]]
; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A2]], lsl #2]
; VBITS_LE_256-DAG: mov x[[A3:[0-9]+]], #[[#mul(div(VBYTES,4),3)]]
; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A3]], lsl #2]
; CHECK: ret
; VBITS_GE_256-LABEL: store_v32f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #24
; VBITS_GE_256-NEXT: mov x9, #16
; VBITS_GE_256-NEXT: mov x10, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov z0.s, #0 // =0x0
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: store_v32f32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: mov x8, #16
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: mov z0.s, #0 // =0x0
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_512-NEXT: ret
;
; VBITS_GE_1024-LABEL: store_v32f32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: mov z0.s, #0 // =0x0
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: store_v32f32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
; VBITS_GE_2048-NEXT: mov z0.s, #0 // =0x0
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
store <32 x float> zeroinitializer, <32 x float>* %a
ret void
}
define void @store_v64f32(<64 x float>* %a) #0 {
; CHECK-LABEL: store_v64f32:
; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
; CHECK-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0]
; VBITS_LE_1024-DAG: mov x[[A1:[0-9]+]], #[[#div(VBYTES,4)]]
; VBITS_LE_1024-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A1]], lsl #2]
; VBITS_LE_512-DAG: mov x[[A2:[0-9]+]], #[[#mul(div(VBYTES,4),2)]]
; VBITS_LE_512-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A2]], lsl #2]
; VBITS_LE_512-DAG: mov x[[A3:[0-9]+]], #[[#mul(div(VBYTES,4),3)]]
; VBITS_LE_512-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A3]], lsl #2]
; VBITS_LE_256-DAG: mov x[[A4:[0-9]+]], #[[#mul(div(VBYTES,4),4)]]
; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A4]], lsl #2]
; VBITS_LE_256-DAG: mov x[[A5:[0-9]+]], #[[#mul(div(VBYTES,4),5)]]
; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A5]], lsl #2]
; VBITS_LE_256-DAG: mov x[[A6:[0-9]+]], #[[#mul(div(VBYTES,4),6)]]
; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A6]], lsl #2]
; VBITS_LE_256-DAG: mov x[[A7:[0-9]+]], #[[#mul(div(VBYTES,4),7)]]
; VBITS_LE_256-DAG: st1w { z{{[0-9]+}}.s }, [[PG]], [x0, x[[A7]], lsl #2]
; CHECK: ret
; VBITS_GE_256-LABEL: store_v64f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #56
; VBITS_GE_256-NEXT: mov x9, #48
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov z0.s, #0 // =0x0
; VBITS_GE_256-NEXT: mov x10, #40
; VBITS_GE_256-NEXT: mov x11, #32
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: mov x8, #24
; VBITS_GE_256-NEXT: mov x12, #16
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: mov x9, #8
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x11, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x12, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: store_v64f32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: mov x8, #48
; VBITS_GE_512-NEXT: mov x9, #32
; VBITS_GE_512-NEXT: mov x10, #16
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: mov z0.s, #0 // =0x0
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2]
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0, x10, lsl #2]
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
;
; VBITS_GE_1024-LABEL: store_v64f32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: mov x8, #32
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: mov z0.s, #0 // =0x0
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_1024-NEXT: ret
;
; VBITS_GE_2048-LABEL: store_v64f32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: mov z0.s, #0 // =0x0
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_2048-NEXT: ret
store <64 x float> zeroinitializer, <64 x float>* %a
ret void
}
@@ -1,21 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
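The NOTE line means the CHECK bodies are generated by script rather than edited by hand, which is what this commit relies on. A typical regeneration command, as a sketch (the build path and test path are illustrative placeholders):
python llvm/utils/update_llc_test_checks.py --llc-binary build/bin/llc <path/to/test>.ll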
; Don't use SVE when its registers are no bigger than NEON.
; RUN: llc -aarch64-sve-vector-bits-min=128 -aarch64-enable-atomic-cfg-tidy=false < %s | not grep ptrue
; RUN: llc -aarch64-sve-vector-bits-min=256 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=384 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
; Test we can code generate patterns of the form:
; fixed_length_vector = ISD::EXTRACT_SUBVECTOR scalable_vector, 0
@@ -28,7 +14,7 @@
target triple = "aarch64-unknown-linux-gnu"
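A minimal sketch of the shape these tests exercise (hypothetical function; the real tests below follow the same pattern): a fixed-length vector that lives across a block boundary is materialised in a scalable register, so storing it requires taking the fixed-length part back out, i.e. an EXTRACT_SUBVECTOR at index 0:
define void @sketch(<16 x i16>* %in, <16 x i16>* %out) #0 {
  %v = load <16 x i16>, <16 x i16>* %in
  br label %bb1
bb1:
  store <16 x i16> %v, <16 x i16>* %out
  ret void
}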
define void @subvector_v8i16(<8 x i16> *%in, <8 x i16>* %out) #0 {
define void @subvector_v8i16(<8 x i16> *%in, <8 x i16>* %out) vscale_range(2,0) #0 {
; CHECK-LABEL: subvector_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
@@ -42,7 +28,7 @@ bb1:
ret void
}
define void @subvector_v16i16(<16 x i16> *%in, <16 x i16>* %out) #0 {
define void @subvector_v16i16(<16 x i16> *%in, <16 x i16>* %out) vscale_range(2,0) #0 {
; CHECK-LABEL: subvector_v16i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -82,29 +68,13 @@ bb1:
ret void
}
define void @subvector_v64i16(<64 x i16> *%in, <64 x i16>* %out) #0 {
; VBITS_GE_256-LABEL: subvector_v64i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #48
; VBITS_GE_256-NEXT: mov x9, #32
; VBITS_GE_256-NEXT: mov x10, #16
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x9, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1, x10, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: subvector_v64i16:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @subvector_v64i16(<64 x i16> *%in, <64 x i16>* %out) vscale_range(8,0) #0 {
; CHECK-LABEL: subvector_v64i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: st1h { z0.h }, p0, [x1]
; CHECK-NEXT: ret
%a = load <64 x i16>, <64 x i16>* %in
br label %bb1
@@ -113,7 +83,7 @@ bb1:
ret void
}
define void @subvector_v8i32(<8 x i32> *%in, <8 x i32>* %out) #0 {
define void @subvector_v8i32(<8 x i32> *%in, <8 x i32>* %out) vscale_range(2,0) #0 {
; CHECK-LABEL: subvector_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -153,29 +123,13 @@ bb1:
ret void
}
define void @subvector_v32i32(<32 x i32> *%in, <32 x i32>* %out) #0 {
; VBITS_GE_256-LABEL: subvector_v32i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #24
; VBITS_GE_256-NEXT: mov x9, #16
; VBITS_GE_256-NEXT: mov x10, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: subvector_v32i32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @subvector_v32i32(<32 x i32> *%in, <32 x i32>* %out) vscale_range(8,0) #0 {
; CHECK-LABEL: subvector_v32i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret
%a = load <32 x i32>, <32 x i32>* %in
br label %bb1
@@ -184,41 +138,13 @@ bb1:
ret void
}
define void @subvector_v64i32(<64 x i32> *%in, <64 x i32>* %out) #0 {
; VBITS_GE_256-LABEL: subvector_v64i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #56
; VBITS_GE_256-NEXT: mov x9, #48
; VBITS_GE_256-NEXT: mov x10, #40
; VBITS_GE_256-NEXT: mov x11, #32
; VBITS_GE_256-NEXT: mov x12, #24
; VBITS_GE_256-NEXT: mov x13, #16
; VBITS_GE_256-NEXT: mov x14, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x11, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x12, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x13, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0, x14, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x1, x11, lsl #2]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x1, x13, lsl #2]
; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x1, x12, lsl #2]
; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x1, x14, lsl #2]
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: subvector_v64i32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
define void @subvector_v64i32(<64 x i32> *%in, <64 x i32>* %out) vscale_range(16,0) #0 {
; CHECK-LABEL: subvector_v64i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret
%a = load <64 x i32>, <64 x i32>* %in
br label %bb1
@@ -228,23 +154,16 @@
}
define void @subvector_v8i64(<8 x i64> *%in, <8 x i64>* %out) #0 {
; VBITS_GE_256-LABEL: subvector_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: subvector_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT: ret
define void @subvector_v8i64(<8 x i64> *%in, <8 x i64>* %out) vscale_range(2,0) #0 {
; CHECK-LABEL: subvector_v8i64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x8, #4
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0]
; CHECK-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
; CHECK-NEXT: st1d { z1.d }, p0, [x1]
; CHECK-NEXT: ret
%a = load <8 x i64>, <8 x i64>* %in
br label %bb1
@@ -253,29 +172,13 @@ bb1:
ret void
}
define void @subvector_v16i64(<16 x i64> *%in, <16 x i64>* %out) #0 {
; VBITS_GE_256-LABEL: subvector_v16i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #12
; VBITS_GE_256-NEXT: mov x9, #8
; VBITS_GE_256-NEXT: mov x10, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: subvector_v16i64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @subvector_v16i64(<16 x i64> *%in, <16 x i64>* %out) vscale_range(8,0) #0 {
; CHECK-LABEL: subvector_v16i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; CHECK-NEXT: ret
%a = load <16 x i64>, <16 x i64>* %in
br label %bb1
@@ -284,41 +187,13 @@ bb1:
ret void
}
define void @subvector_v32i64(<32 x i64> *%in, <32 x i64>* %out) #0 {
; VBITS_GE_256-LABEL: subvector_v32i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #28
; VBITS_GE_256-NEXT: mov x9, #24
; VBITS_GE_256-NEXT: mov x10, #20
; VBITS_GE_256-NEXT: mov x11, #16
; VBITS_GE_256-NEXT: mov x12, #12
; VBITS_GE_256-NEXT: mov x13, #8
; VBITS_GE_256-NEXT: mov x14, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x11, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x12, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x13, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0, x14, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x1, x11, lsl #3]
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x1, x13, lsl #3]
; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x1, x12, lsl #3]
; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x1, x14, lsl #3]
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: subvector_v32i64:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
define void @subvector_v32i64(<32 x i64> *%in, <32 x i64>* %out) vscale_range(16,0) #0 {
; CHECK-LABEL: subvector_v32i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; CHECK-NEXT: ret
%a = load <32 x i64>, <32 x i64>* %in
br label %bb1
@@ -327,7 +202,7 @@ bb1:
ret void
}
define void @subvector_v8f16(<8 x half> *%in, <8 x half>* %out) #0 {
define void @subvector_v8f16(<8 x half> *%in, <8 x half>* %out) vscale_range(2,0) #0 {
; CHECK-LABEL: subvector_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
@@ -341,7 +216,7 @@ bb1:
ret void
}
define void @subvector_v16f16(<16 x half> *%in, <16 x half>* %out) #0 {
define void @subvector_v16f16(<16 x half> *%in, <16 x half>* %out) vscale_range(2,0) #0 {
; CHECK-LABEL: subvector_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
@@ -381,29 +256,13 @@ bb1:
ret void
}
define void @subvector_v64f16(<64 x half> *%in, <64 x half>* %out) #0 {
; VBITS_GE_256-LABEL: subvector_v64f16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #48
; VBITS_GE_256-NEXT: mov x9, #32
; VBITS_GE_256-NEXT: mov x10, #16
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x10, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x9, lsl #1]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1, x10, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: subvector_v64f16:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @subvector_v64f16(<64 x half> *%in, <64 x half>* %out) vscale_range(8,0) #0 {
; CHECK-LABEL: subvector_v64f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: st1h { z0.h }, p0, [x1]
; CHECK-NEXT: ret
%a = load <64 x half>, <64 x half>* %in
br label %bb1
@@ -412,7 +271,7 @@ bb1:
ret void
}
define void @subvector_v8f32(<8 x float> *%in, <8 x float>* %out) #0 {
define void @subvector_v8f32(<8 x float> *%in, <8 x float>* %out) vscale_range(2,0) #0 {
; CHECK-LABEL: subvector_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
@@ -452,29 +311,13 @@ bb1:
ret void
}
define void @subvector_v32f32(<32 x float> *%in, <32 x float>* %out) #0 {
; VBITS_GE_256-LABEL: subvector_v32f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #24
; VBITS_GE_256-NEXT: mov x9, #16
; VBITS_GE_256-NEXT: mov x10, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: subvector_v32f32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @subvector_v32f32(<32 x float> *%in, <32 x float>* %out) vscale_range(8,0) #0 {
; CHECK-LABEL: subvector_v32f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret
%a = load <32 x float>, <32 x float>* %in
br label %bb1
@@ -483,41 +326,13 @@ bb1:
ret void
}
define void @subvector_v64f32(<64 x float> *%in, <64 x float>* %out) #0 {
; VBITS_GE_256-LABEL: subvector_v64f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #56
; VBITS_GE_256-NEXT: mov x9, #48
; VBITS_GE_256-NEXT: mov x10, #40
; VBITS_GE_256-NEXT: mov x11, #32
; VBITS_GE_256-NEXT: mov x12, #24
; VBITS_GE_256-NEXT: mov x13, #16
; VBITS_GE_256-NEXT: mov x14, #8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x11, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x12, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x13, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0, x14, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x1, x11, lsl #2]
; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x10, lsl #2]
; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x1, x13, lsl #2]
; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x1, x12, lsl #2]
; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x1, x14, lsl #2]
; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: subvector_v64f32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
define void @subvector_v64f32(<64 x float> *%in, <64 x float>* %out) vscale_range(16,0) #0 {
; CHECK-LABEL: subvector_v64f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret
%a = load <64 x float>, <64 x float>* %in
br label %bb1
@@ -550,29 +365,13 @@ bb1:
ret void
}
define void @subvector_v16f64(<16 x double> *%in, <16 x double>* %out) #0 {
; VBITS_GE_256-LABEL: subvector_v16f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #12
; VBITS_GE_256-NEXT: mov x9, #8
; VBITS_GE_256-NEXT: mov x10, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_1024-LABEL: subvector_v16f64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @subvector_v16f64(<16 x double> *%in, <16 x double>* %out) vscale_range(8,0) #0 {
; CHECK-LABEL: subvector_v16f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; CHECK-NEXT: ret
%a = load <16 x double>, <16 x double>* %in
br label %bb1
@@ -581,41 +380,13 @@ bb1:
ret void
}
define void @subvector_v32f64(<32 x double> *%in, <32 x double>* %out) #0 {
; VBITS_GE_256-LABEL: subvector_v32f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #28
; VBITS_GE_256-NEXT: mov x9, #24
; VBITS_GE_256-NEXT: mov x10, #20
; VBITS_GE_256-NEXT: mov x11, #16
; VBITS_GE_256-NEXT: mov x12, #12
; VBITS_GE_256-NEXT: mov x13, #8
; VBITS_GE_256-NEXT: mov x14, #4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x10, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x11, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x12, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x13, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0, x14, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x1, x11, lsl #3]
; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x10, lsl #3]
; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x1, x13, lsl #3]
; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x1, x12, lsl #3]
; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x1, x14, lsl #3]
; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_2048-LABEL: subvector_v32f64:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
define void @subvector_v32f64(<32 x double> *%in, <32 x double>* %out) vscale_range(16,0) #0 {
; CHECK-LABEL: subvector_v32f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; CHECK-NEXT: ret
%a = load <32 x double>, <32 x double>* %in
br label %bb1


@@ -1,43 +1,30 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_EQ_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_2048,VBITS_GE_1024,VBITS_GE_512
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

 target triple = "aarch64-unknown-linux-gnu"

-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: ptrue
-
-define void @store_trunc_v2i64i8(<2 x i64>* %ap, <2 x i8>* %dest) #0 {
-; CHECK-LABEL: store_trunc_v2i64i8
-; CHECK: ldr q[[Q0:[0-9]+]], [x0]
-; CHECK: ptrue p[[P0:[0-9]+]].d, vl2
-; CHECK-NEXT: st1b { z[[Q0]].d }, p[[P0]], [x1]
-; CHECK-NEXT: ret
+define void @store_trunc_v2i64i8(<2 x i64>* %ap, <2 x i8>* %dest) vscale_range(2,0) #0 {
+; CHECK-LABEL: store_trunc_v2i64i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: st1b { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
   %a = load <2 x i64>, <2 x i64>* %ap
   %val = trunc <2 x i64> %a to <2 x i8>
   store <2 x i8> %val, <2 x i8>* %dest
   ret void
 }

-define void @store_trunc_v4i64i8(<4 x i64>* %ap, <4 x i8>* %dest) #0 {
-; CHECK-LABEL: store_trunc_v4i64i8
-; CHECK: ptrue p[[P0:[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
-; CHECK-NEXT: st1b { z[[Q0]].d }, p[[P0]], [x1]
-; CHECK-NEXT: ret
+define void @store_trunc_v4i64i8(<4 x i64>* %ap, <4 x i8>* %dest) vscale_range(2,0) #0 {
+; CHECK-LABEL: store_trunc_v4i64i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
   %a = load <4 x i64>, <4 x i64>* %ap
   %val = trunc <4 x i64> %a to <4 x i8>
   store <4 x i8> %val, <4 x i8>* %dest
@@ -45,48 +32,52 @@ define void @store_trunc_v4i64i8(<4 x i64>* %ap, <4 x i8>* %dest) #0 {
 }

 define void @store_trunc_v8i64i8(<8 x i64>* %ap, <8 x i8>* %dest) #0 {
-; CHECK-LABEL: store_trunc_v8i64i8:
-; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
-; VBITS_GE_512-NEXT: st1b { [[Z0]].d }, p[[P0]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[DWORDS_LO:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[DWORDS_HI:z[0-9]+]].d }, [[PG1]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s, vl4
-; VBITS_EQ_256-DAG: uzp1 [[WORDS_LO:z[0-9]+]].s, [[DWORDS_LO]].s, [[DWORDS_LO]].s
-; VBITS_EQ_256-DAG: uzp1 [[WORDS_HI:z[0-9]+]].s, [[DWORDS_HI]].s, [[DWORDS_HI]].s
-; VBITS_EQ_256-DAG: splice [[WORDS:z[0-9]+]].s, [[PG2]], [[WORDS_LO]].s, [[WORDS_HI]].s
-; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: st1b { [[WORDS]].s }, [[PG3]], [x1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: store_trunc_v8i64i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.s, vl4
+; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: st1b { z1.s }, p0, [x1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: store_trunc_v8i64i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1b { z0.d }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
   %a = load <8 x i64>, <8 x i64>* %ap
   %val = trunc <8 x i64> %a to <8 x i8>
   store <8 x i8> %val, <8 x i8>* %dest
   ret void
 }

-define void @store_trunc_v16i64i8(<16 x i64>* %ap, <16 x i8>* %dest) #0 {
-; CHECK-LABEL: store_trunc_v16i64i8:
-; VBITS_GE_1024: ptrue p[[P0:[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: st1b { [[Z0]].d }, p[[P0]], [x1]
-; VBITS_GE_1024-NEXT: ret
+define void @store_trunc_v16i64i8(<16 x i64>* %ap, <16 x i8>* %dest) vscale_range(8,0) #0 {
+; CHECK-LABEL: store_trunc_v16i64i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
   %a = load <16 x i64>, <16 x i64>* %ap
   %val = trunc <16 x i64> %a to <16 x i8>
   store <16 x i8> %val, <16 x i8>* %dest
   ret void
 }

-define void @store_trunc_v32i64i8(<32 x i64>* %ap, <32 x i8>* %dest) #0 {
-; CHECK-LABEL: store_trunc_v32i64i8:
-; VBITS_GE_2048: ptrue p[[P0:[0-9]+]].d, vl32
-; VBITS_GE_2048-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: st1b { [[Z0]].d }, p[[P0]], [x1]
-; VBITS_GE_2048-NEXT: ret
+define void @store_trunc_v32i64i8(<32 x i64>* %ap, <32 x i8>* %dest) vscale_range(16,0) #0 {
+; CHECK-LABEL: store_trunc_v32i64i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
   %a = load <32 x i64>, <32 x i64>* %ap
   %val = trunc <32 x i64> %a to <32 x i8>
   store <32 x i8> %val, <32 x i8>* %dest
@@ -94,25 +85,27 @@ define void @store_trunc_v32i64i8(<32 x i64>* %ap, <32 x i8>* %dest) #0 {
 }

 define void @store_trunc_v8i64i16(<8 x i64>* %ap, <8 x i16>* %dest) #0 {
-; CHECK-LABEL: store_trunc_v8i64i16:
-; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
-; VBITS_GE_512-NEXT: st1h { [[Z0]].d }, p[[P0]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
 ; Currently does not use the truncating store
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[DWORDS_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[DWORDS_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: uzp1 [[WORDS_LO:z[0-9]+]].s, [[DWORDS_LO]].s, [[DWORDS_LO]].s
-; VBITS_EQ_256-DAG: uzp1 [[WORDS_HI:z[0-9]+]].s, [[DWORDS_HI]].s, [[DWORDS_HI]].s
-; VBITS_EQ_256-DAG: uzp1 z[[HALFS_LO:[0-9]+]].h, [[WORDS_LO]].h, [[WORDS_LO]].h
-; VBITS_EQ_256-DAG: uzp1 z[[HALFS_HI:[0-9]+]].h, [[WORDS_HI]].h, [[WORDS_HI]].h
-; VBITS_EQ_256-NEXT: mov v[[HALFS_LO]].d[1], v[[HALFS_HI]].d[0]
-; VBITS_EQ_256-NEXT: str q[[HALFS_LO]], [x1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: store_trunc_v8i64i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0]
+; VBITS_GE_256-NEXT: str q1, [x1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: store_trunc_v8i64i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1h { z0.d }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
   %a = load <8 x i64>, <8 x i64>* %ap
   %val = trunc <8 x i64> %a to <8 x i16>
   store <8 x i16> %val, <8 x i16>* %dest
@@ -120,24 +113,26 @@ define void @store_trunc_v8i64i16(<8 x i64>* %ap, <8 x i16>* %dest) #0 {
 }

 define void @store_trunc_v8i64i32(<8 x i64>* %ap, <8 x i32>* %dest) #0 {
-; CHECK-LABEL: store_trunc_v8i64i32:
-; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
-; VBITS_GE_512-NEXT: st1w { [[Z0]].d }, p[[P0]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[DWORDS_LO:z[0-9]+]].d }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1d { [[DWORDS_HI:z[0-9]+]].d }, [[PG1]]/z, [x0, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s, vl4
-; VBITS_EQ_256-DAG: uzp1 [[WORDS_LO:z[0-9]+]].s, [[DWORDS_LO]].s, [[DWORDS_LO]].s
-; VBITS_EQ_256-DAG: uzp1 [[WORDS_HI:z[0-9]+]].s, [[DWORDS_HI]].s, [[DWORDS_HI]].s
-; VBITS_EQ_256-DAG: splice [[WORDS:z[0-9]+]].s, [[PG1]], [[WORDS_LO]].s, [[WORDS_HI]].s
-; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: st1w { [[WORDS]].s }, [[PG3]], [x1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: store_trunc_v8i64i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.s, vl4
+; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: store_trunc_v8i64i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1w { z0.d }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
   %a = load <8 x i64>, <8 x i64>* %ap
   %val = trunc <8 x i64> %a to <8 x i32>
   store <8 x i32> %val, <8 x i32>* %dest
@@ -145,25 +140,27 @@ define void @store_trunc_v8i64i32(<8 x i64>* %ap, <8 x i32>* %dest) #0 {
 }

 define void @store_trunc_v16i32i8(<16 x i32>* %ap, <16 x i8>* %dest) #0 {
-; CHECK-LABEL: store_trunc_v16i32i8:
-; VBITS_GE_512: ptrue p[[P0:[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
-; VBITS_GE_512-NEXT: st1b { [[Z0]].s }, p[[P0]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation.
 ; Currently does not use the truncating store
-; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[WORDS_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[WORDS_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: uzp1 [[HALFS_LO:z[0-9]+]].h, [[WORDS_LO]].h, [[WORDS_LO]].h
-; VBITS_EQ_256-DAG: uzp1 [[HALFS_HI:z[0-9]+]].h, [[WORDS_HI]].h, [[WORDS_HI]].h
-; VBITS_EQ_256-DAG: uzp1 z[[BYTES_LO:[0-9]+]].b, [[HALFS_LO]].b, [[HALFS_LO]].b
-; VBITS_EQ_256-DAG: uzp1 z[[BYTES_HI:[0-9]+]].b, [[HALFS_HI]].b, [[HALFS_HI]].b
-; VBITS_EQ_256-NEXT: mov v[[BYTES_LO]].d[1], v[[BYTES_HI]].d[0]
-; VBITS_EQ_256-NEXT: str q[[BYTES_LO]], [x1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: store_trunc_v16i32i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b
+; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0]
+; VBITS_GE_256-NEXT: str q1, [x1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: store_trunc_v16i32i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1b { z0.s }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
   %a = load <16 x i32>, <16 x i32>* %ap
   %val = trunc <16 x i32> %a to <16 x i8>
   store <16 x i8> %val, <16 x i8>* %dest
@@ -171,24 +168,26 @@ define void @store_trunc_v16i32i8(<16 x i32>* %ap, <16 x i8>* %dest) #0 {
 }

 define void @store_trunc_v16i32i16(<16 x i32>* %ap, <16 x i16>* %dest) #0 {
-; CHECK-LABEL: store_trunc_v16i32i16:
-; VBITS_GE_512: ptrue p[[P0:[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
-; VBITS_GE_512-NEXT: st1h { [[Z0]].s }, p[[P0]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
-; VBITS_EQ_256-DAG: ld1w { [[WORDS_LO:z[0-9]+]].s }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1w { [[WORDS_HI:z[0-9]+]].s }, [[PG1]]/z, [x0, x[[NUMELTS]], lsl #2]
-; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].h, vl8
-; VBITS_EQ_256-DAG: uzp1 [[HALFS_LO:z[0-9]+]].h, [[WORDS_LO]].h, [[WORDS_LO]].h
-; VBITS_EQ_256-DAG: uzp1 [[HALFS_HI:z[0-9]+]].h, [[WORDS_HI]].h, [[WORDS_HI]].h
-; VBITS_EQ_256-DAG: splice [[HALFS:z[0-9]+]].h, [[PG2]], [[HALFS_LO]].h, [[HALFS_HI]].h
-; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].h, vl16
-; VBITS_EQ_256-NEXT: st1h { [[HALFS]].h }, [[PG3]], [x1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: store_trunc_v16i32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.h, vl8
+; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: store_trunc_v16i32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1h { z0.s }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
   %a = load <16 x i32>, <16 x i32>* %ap
   %val = trunc <16 x i32> %a to <16 x i16>
   store <16 x i16> %val, <16 x i16>* %dest
@@ -196,24 +195,26 @@ define void @store_trunc_v16i32i16(<16 x i32>* %ap, <16 x i16>* %dest) #0 {
 }

 define void @store_trunc_v32i16i8(<32 x i16>* %ap, <32 x i8>* %dest) #0 {
-; CHECK-LABEL: store_trunc_v32i16i8:
-; VBITS_GE_512: ptrue p[[P0:[0-9]+]].h, vl32
-; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, p0/z, [x0]
-; VBITS_GE_512-NEXT: st1b { [[Z0]].h }, p[[P0]], [x1]
-; VBITS_GE_512-NEXT: ret
-
-; Ensure sensible type legalisation
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].h, vl16
-; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
-; VBITS_EQ_256-DAG: ld1h { [[HALFS_LO:z[0-9]+]].h }, [[PG1]]/z, [x0]
-; VBITS_EQ_256-DAG: ld1h { [[HALFS_HI:z[0-9]+]].h }, [[PG1]]/z, [x0, x[[NUMELTS]], lsl #1]
-; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].b, vl16
-; VBITS_EQ_256-DAG: uzp1 [[BYTES_LO:z[0-9]+]].b, [[HALFS_LO]].b, [[HALFS_LO]].b
-; VBITS_EQ_256-DAG: uzp1 [[BYTES_HI:z[0-9]+]].b, [[HALFS_HI]].b, [[HALFS_HI]].b
-; VBITS_EQ_256-DAG: splice [[BYTES:z[0-9]+]].b, [[PG2]], [[BYTES_LO]].b, [[BYTES_HI]].b
-; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].b, vl32
-; VBITS_EQ_256-NEXT: st1b { [[BYTES]].b }, [[PG3]], [x1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: store_trunc_v32i16i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.b, vl16
+; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b
+; VBITS_GE_256-NEXT: splice z1.b, p0, z1.b, z0.b
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: store_trunc_v32i16i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1b { z0.h }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
   %a = load <32 x i16>, <32 x i16>* %ap
   %val = trunc <32 x i16> %a to <32 x i8>
   store <32 x i8> %val, <32 x i8>* %dest


@@ -1,35 +1,22 @@
-; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

 target triple = "aarch64-unknown-linux-gnu"

-; Don't use SVE when its registers are no bigger than NEON.
-; NO_SVE-NOT: z{0-9}
-
 ;
 ; truncate i16 -> i8
 ;

-define <16 x i8> @trunc_v16i16_v16i8(<16 x i16>* %in) #0 {
+define <16 x i8> @trunc_v16i16_v16i8(<16 x i16>* %in) vscale_range(2,0) #0 {
 ; CHECK-LABEL: trunc_v16i16_v16i8:
-; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
-; CHECK-NEXT: ld1h { [[A_HALFS:z[0-9]+]].h }, [[PG]]/z, [x0]
-; CHECK-NEXT: uzp1 z0.b, [[A_HALFS]].b, [[A_HALFS]].b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
   %a = load <16 x i16>, <16 x i16>* %in
   %b = trunc <16 x i16> %a to <16 x i8>
   ret <16 x i8> %b
@@ -37,11 +24,30 @@ define <16 x i8> @trunc_v16i16_v16i8(<16 x i16>* %in) #0 {

 ; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
 define void @trunc_v32i16_v32i8(<32 x i16>* %in, <32 x i8>* %out) #0 {
-; CHECK-LABEL: trunc_v32i16_v32i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
-; VBITS_GE_512: ld1h { [[A_HALFS:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_512: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b
-; VBITS_GE_512: add [[A_BYTES]].b, [[A_BYTES]].b, [[A_BYTES]].b
+; VBITS_GE_256-LABEL: trunc_v32i16_v32i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.b, vl16
+; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b
+; VBITS_GE_256-NEXT: splice z1.b, p0, z1.b, z0.b
+; VBITS_GE_256-NEXT: ptrue p0.b, vl32
+; VBITS_GE_256-NEXT: add z0.b, z1.b, z1.b
+; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: trunc_v32i16_v32i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ptrue p0.b, vl32
+; VBITS_GE_512-NEXT: uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_512-NEXT: add z0.b, z0.b, z0.b
+; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
   %a = load <32 x i16>, <32 x i16>* %in
   %b = trunc <32 x i16> %a to <32 x i8>
   %c = add <32 x i8> %b, %b
@@ -50,12 +56,16 @@ define void @trunc_v32i16_v32i8(<32 x i16>* %in, <32 x i8>* %out) #0 {
 }

 ; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
-define void @trunc_v64i16_v64i8(<64 x i16>* %in, <64 x i8>* %out) #0 {
+define void @trunc_v64i16_v64i8(<64 x i16>* %in, <64 x i8>* %out) vscale_range(8,0) #0 {
 ; CHECK-LABEL: trunc_v64i16_v64i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
-; VBITS_GE_1024: ld1h { [[A_HALFS:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_1024: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b
-; VBITS_GE_1024: add [[A_BYTES]].b, [[A_BYTES]].b, [[A_BYTES]].b
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.b, vl64
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: add z0.b, z0.b, z0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
   %a = load <64 x i16>, <64 x i16>* %in
   %b = trunc <64 x i16> %a to <64 x i8>
   %c = add <64 x i8> %b, %b
@@ -64,12 +74,16 @@ define void @trunc_v64i16_v64i8(<64 x i16>* %in, <64 x i8>* %out) #0 {
 }

 ; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
-define void @trunc_v128i16_v128i8(<128 x i16>* %in, <128 x i8>* %out) #0 {
+define void @trunc_v128i16_v128i8(<128 x i16>* %in, <128 x i8>* %out) vscale_range(16,0) #0 {
 ; CHECK-LABEL: trunc_v128i16_v128i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
-; VBITS_GE_2048: ld1h { [[A_HALFS:z[0-9]+]].h }, [[PG]]/z, [x0]
-; VBITS_GE_2048: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b
-; VBITS_GE_2048: add [[A_BYTES]].b, [[A_BYTES]].b, [[A_BYTES]].b
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl128
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.b, vl128
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: add z0.b, z0.b, z0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
   %a = load <128 x i16>, <128 x i16>* %in
   %b = trunc <128 x i16> %a to <128 x i8>
   %c = add <128 x i8> %b, %b
@@ -81,38 +95,60 @@ define void @trunc_v128i16_v128i8(<128 x i16>* %in, <128 x i8>* %out) #0 {
 ; truncate i32 -> i8
 ;

-define <8 x i8> @trunc_v8i32_v8i8(<8 x i32>* %in) #0 {
+define <8 x i8> @trunc_v8i32_v8i8(<8 x i32>* %in) vscale_range(2,0) #0 {
 ; CHECK-LABEL: trunc_v8i32_v8i8:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; CHECK-NEXT: uzp1 z0.b, [[A_HALFS]].b, [[A_HALFS]].b
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
   %a = load <8 x i32>, <8 x i32>* %in
   %b = trunc <8 x i32> %a to <8 x i8>
   ret <8 x i8> %b
 }

 define <16 x i8> @trunc_v16i32_v16i8(<16 x i32>* %in) #0 {
-; CHECK-LABEL: trunc_v16i32_v16i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_512-NEXT: uzp1 z0.b, [[A_HALFS]].b, [[A_HALFS]].b
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: trunc_v16i32_v16i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_256-NEXT: uzp1 z2.b, z0.b, z0.b
+; VBITS_GE_256-NEXT: uzp1 z0.b, z1.b, z1.b
+; VBITS_GE_256-NEXT: mov v0.d[1], v2.d[0]
+; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: trunc_v16i32_v16i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT: uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_512-NEXT: ret
   %a = load <16 x i32>, <16 x i32>* %in
   %b = trunc <16 x i32> %a to <16 x i8>
   ret <16 x i8> %b
 }

 ; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
-define void @trunc_v32i32_v32i8(<32 x i32>* %in, <32 x i8>* %out) #0 {
+define void @trunc_v32i32_v32i8(<32 x i32>* %in, <32 x i8>* %out) vscale_range(8,0) #0 {
 ; CHECK-LABEL: trunc_v32i32_v32i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_1024: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b
-; VBITS_GE_1024: add [[A_BYTES]].b, [[A_BYTES]].b, [[A_BYTES]].b
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: add z0.b, z0.b, z0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
   %a = load <32 x i32>, <32 x i32>* %in
   %b = trunc <32 x i32> %a to <32 x i8>
   %c = add <32 x i8> %b, %b
@@ -121,13 +157,17 @@ define void @trunc_v32i32_v32i8(<32 x i32>* %in, <32 x i8>* %out) #0 {
 }

 ; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
-define void @trunc_v64i32_v64i8(<64 x i32>* %in, <64 x i8>* %out) #0 {
+define void @trunc_v64i32_v64i8(<64 x i32>* %in, <64 x i8>* %out) vscale_range(16,0) #0 {
 ; CHECK-LABEL: trunc_v64i32_v64i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_2048: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b
-; VBITS_GE_2048: add [[A_BYTES]].b, [[A_BYTES]].b, [[A_BYTES]].b
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.b, vl64
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: add z0.b, z0.b, z0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
   %a = load <64 x i32>, <64 x i32>* %in
   %b = trunc <64 x i32> %a to <64 x i8>
   %c = add <64 x i8> %b, %b
@@ -139,12 +179,14 @@ define void @trunc_v64i32_v64i8(<64 x i32>* %in, <64 x i8>* %out) #0 {
 ; truncate i32 -> i16
 ;

-define <8 x i16> @trunc_v8i32_v8i16(<8 x i32>* %in) #0 {
+define <8 x i16> @trunc_v8i32_v8i16(<8 x i32>* %in) vscale_range(2,0) #0 {
 ; CHECK-LABEL: trunc_v8i32_v8i16:
-; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
-; CHECK-NEXT: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
-; CHECK-NEXT: uzp1 z0.h, [[A_WORDS]].h, [[A_WORDS]].h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
   %a = load <8 x i32>, <8 x i32>* %in
   %b = trunc <8 x i32> %a to <8 x i16>
   ret <8 x i16> %b
@@ -152,11 +194,30 @@ define <8 x i16> @trunc_v8i32_v8i16(<8 x i32>* %in) #0 {

 ; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
 define void @trunc_v16i32_v16i16(<16 x i32>* %in, <16 x i16>* %out) #0 {
-; CHECK-LABEL: trunc_v16i32_v16i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
-; VBITS_GE_512: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_512: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_512: add [[A_HALFS]].h, [[A_HALFS]].h, [[A_HALFS]].h
+; VBITS_GE_256-LABEL: trunc_v16i32_v16i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.h, vl8
+; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: add z0.h, z1.h, z1.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: trunc_v16i32_v16i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ptrue p0.h, vl16
+; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT: add z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
   %a = load <16 x i32>, <16 x i32>* %in
   %b = trunc <16 x i32> %a to <16 x i16>
   %c = add <16 x i16> %b, %b
@@ -165,12 +226,16 @@ define void @trunc_v16i32_v16i16(<16 x i32>* %in, <16 x i16>* %out) #0 {
 }

 ; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
-define void @trunc_v32i32_v32i16(<32 x i32>* %in, <32 x i16>* %out) #0 {
+define void @trunc_v32i32_v32i16(<32 x i32>* %in, <32 x i16>* %out) vscale_range(8,0) #0 {
 ; CHECK-LABEL: trunc_v32i32_v32i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
-; VBITS_GE_1024: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_1024: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_1024: add [[A_HALFS]].h, [[A_HALFS]].h, [[A_HALFS]].h
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: add z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
   %a = load <32 x i32>, <32 x i32>* %in
   %b = trunc <32 x i32> %a to <32 x i16>
   %c = add <32 x i16> %b, %b
@@ -179,12 +244,16 @@ define void @trunc_v32i32_v32i16(<32 x i32>* %in, <32 x i16>* %out) #0 {
 }

 ; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
-define void @trunc_v64i32_v64i16(<64 x i32>* %in, <64 x i16>* %out) #0 {
+define void @trunc_v64i32_v64i16(<64 x i32>* %in, <64 x i16>* %out) vscale_range(16,0) #0 {
 ; CHECK-LABEL: trunc_v64i32_v64i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
-; VBITS_GE_2048: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0]
-; VBITS_GE_2048: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_2048: add [[A_HALFS]].h, [[A_HALFS]].h, [[A_HALFS]].h
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl64
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: add z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
   %a = load <64 x i32>, <64 x i32>* %in
   %b = trunc <64 x i32> %a to <64 x i16>
   %c = add <64 x i16> %b, %b
@@ -197,53 +266,78 @@ define void @trunc_v64i32_v64i16(<64 x i32>* %in, <64 x i16>* %out) #0 {
 ;

 ; NOTE: v4i8 is not legal so result i8 elements are held within i16 containers.
-define <4 x i8> @trunc_v4i64_v4i8(<4 x i64>* %in) #0 {
+define <4 x i8> @trunc_v4i64_v4i8(<4 x i64>* %in) vscale_range(2,0) #0 {
 ; CHECK-LABEL: trunc_v4i64_v4i8:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; CHECK-NEXT: uzp1 z0.h, [[A_WORDS]].h, [[A_WORDS]].h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
   %a = load <4 x i64>, <4 x i64>* %in
   %b = trunc <4 x i64> %a to <4 x i8>
   ret <4 x i8> %b
 }

 define <8 x i8> @trunc_v8i64_v8i8(<8 x i64>* %in) #0 {
-; CHECK-LABEL: trunc_v8i64_v8i8:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; VBITS_GE_512-NEXT: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_512-NEXT: uzp1 z0.b, [[A_HALFS]].b, [[A_HALFS]].b
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: trunc_v8i64_v8i8:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.s, vl4
+; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s
+; VBITS_GE_256-NEXT: uzp1 z0.h, z1.h, z1.h
+; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $z0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: trunc_v8i64_v8i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT: uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $z0
+; VBITS_GE_512-NEXT: ret
   %a = load <8 x i64>, <8 x i64>* %in
   %b = trunc <8 x i64> %a to <8 x i8>
   ret <8 x i8> %b
 }

-define <16 x i8> @trunc_v16i64_v16i8(<16 x i64>* %in) #0 {
+define <16 x i8> @trunc_v16i64_v16i8(<16 x i64>* %in) vscale_range(8,0) #0 {
 ; CHECK-LABEL: trunc_v16i64_v16i8:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024-NEXT: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; VBITS_GE_1024-NEXT: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_1024-NEXT: uzp1 z0.b, [[A_HALFS]].b, [[A_HALFS]].b
-; VBITS_GE_1024-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
   %a = load <16 x i64>, <16 x i64>* %in
   %b = trunc <16 x i64> %a to <16 x i8>
   ret <16 x i8> %b
 }

 ; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
-define void @trunc_v32i64_v32i8(<32 x i64>* %in, <32 x i8>* %out) #0 {
+define void @trunc_v32i64_v32i8(<32 x i64>* %in, <32 x i8>* %out) vscale_range(16,0) #0 {
 ; CHECK-LABEL: trunc_v32i64_v32i8:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; VBITS_GE_2048: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_2048: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b
-; VBITS_GE_2048: add [[A_BYTES]].b, [[A_BYTES]].b, [[A_BYTES]].b
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.b, vl32
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: add z0.b, z0.b, z0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
   %a = load <32 x i64>, <32 x i64>* %in
   %b = trunc <32 x i64> %a to <32 x i8>
   %c = add <32 x i8> %b, %b
@@ -255,38 +349,60 @@ define void @trunc_v32i64_v32i8(<32 x i64>* %in, <32 x i8>* %out) #0 {
 ; truncate i64 -> i16
 ;

-define <4 x i16> @trunc_v4i64_v4i16(<4 x i64>* %in) #0 {
+define <4 x i16> @trunc_v4i64_v4i16(<4 x i64>* %in) vscale_range(2,0) #0 {
 ; CHECK-LABEL: trunc_v4i64_v4i16:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; CHECK-NEXT: uzp1 z0.h, [[A_WORDS]].h, [[A_WORDS]].h
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
   %a = load <4 x i64>, <4 x i64>* %in
   %b = trunc <4 x i64> %a to <4 x i16>
   ret <4 x i16> %b
 }

 define <8 x i16> @trunc_v8i64_v8i16(<8 x i64>* %in) #0 {
-; CHECK-LABEL: trunc_v8i64_v8i16:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512-NEXT: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; VBITS_GE_512-NEXT: uzp1 z0.h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_512-NEXT: ret
+; VBITS_GE_256-LABEL: trunc_v8i64_v8i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT: uzp1 z2.h, z0.h, z0.h
+; VBITS_GE_256-NEXT: uzp1 z0.h, z1.h, z1.h
+; VBITS_GE_256-NEXT: mov v0.d[1], v2.d[0]
+; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: trunc_v8i64_v8i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_512-NEXT: ret
   %a = load <8 x i64>, <8 x i64>* %in
   %b = trunc <8 x i64> %a to <8 x i16>
   ret <8 x i16> %b
 }

 ; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
-define void @trunc_v16i64_v16i16(<16 x i64>* %in, <16 x i16>* %out) #0 {
+define void @trunc_v16i64_v16i16(<16 x i64>* %in, <16 x i16>* %out) vscale_range(8,0) #0 {
 ; CHECK-LABEL: trunc_v16i64_v16i16:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; VBITS_GE_1024: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_1024: add [[A_HALFS]].h, [[A_HALFS]].h, [[A_HALFS]].h
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.h, vl16
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: add z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
   %a = load <16 x i64>, <16 x i64>* %in
   %b = trunc <16 x i64> %a to <16 x i16>
   %c = add <16 x i16> %b, %b
@@ -295,13 +411,17 @@ define void @trunc_v16i64_v16i16(<16 x i64>* %in, <16 x i16>* %out) #0 {
 }

 ; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
-define void @trunc_v32i64_v32i16(<32 x i64>* %in, <32 x i16>* %out) #0 {
+define void @trunc_v32i64_v32i16(<32 x i64>* %in, <32 x i16>* %out) vscale_range(16,0) #0 {
 ; CHECK-LABEL: trunc_v32i64_v32i16:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; VBITS_GE_2048: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h
-; VBITS_GE_2048: add [[A_HALFS]].h, [[A_HALFS]].h, [[A_HALFS]].h
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.h, vl32
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: add z0.h, z0.h, z0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
   %a = load <32 x i64>, <32 x i64>* %in
   %b = trunc <32 x i64> %a to <32 x i16>
   %c = add <32 x i16> %b, %b
@@ -313,12 +433,14 @@ define void @trunc_v32i64_v32i16(<32 x i64>* %in, <32 x i16>* %out) #0 {
 ; truncate i64 -> i32
 ;

-define <4 x i32> @trunc_v4i64_v4i32(<4 x i64>* %in) #0 {
+define <4 x i32> @trunc_v4i64_v4i32(<4 x i64>* %in) vscale_range(2,0) #0 {
 ; CHECK-LABEL: trunc_v4i64_v4i32:
-; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; CHECK-NEXT: uzp1 z0.s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
   %a = load <4 x i64>, <4 x i64>* %in
   %b = trunc <4 x i64> %a to <4 x i32>
   ret <4 x i32> %b
@@ -326,11 +448,30 @@ define <4 x i32> @trunc_v4i64_v4i32(<4 x i64>* %in) #0 {

 ; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
 define void @trunc_v8i64_v8i32(<8 x i64>* %in, <8 x i32>* %out) #0 {
-; CHECK-LABEL: trunc_v8i64_v8i32:
-; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
-; VBITS_GE_512: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_512: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; VBITS_GE_512: add [[A_WORDS]].s, [[A_WORDS]].s, [[A_WORDS]].s
+; VBITS_GE_256-LABEL: trunc_v8i64_v8i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ptrue p0.s, vl4
+; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: add z0.s, z1.s, z1.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: trunc_v8i64_v8i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ptrue p0.s, vl8
+; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_512-NEXT: add z0.s, z0.s, z0.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
+; VBITS_GE_512-NEXT: ret
   %a = load <8 x i64>, <8 x i64>* %in
   %b = trunc <8 x i64> %a to <8 x i32>
   %c = add <8 x i32> %b, %b
@@ -339,12 +480,16 @@ define void @trunc_v8i64_v8i32(<8 x i64>* %in, <8 x i32>* %out) #0 {
 }

 ; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
-define void @trunc_v16i64_v16i32(<16 x i64>* %in, <16 x i32>* %out) #0 {
+define void @trunc_v16i64_v16i32(<16 x i64>* %in, <16 x i32>* %out) vscale_range(8,0) #0 {
 ; CHECK-LABEL: trunc_v16i64_v16i32:
-; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
-; VBITS_GE_1024: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_1024: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; VBITS_GE_1024: add [[A_WORDS]].s, [[A_WORDS]].s, [[A_WORDS]].s
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl16
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.s, vl16
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: add z0.s, z0.s, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
   %a = load <16 x i64>, <16 x i64>* %in
   %b = trunc <16 x i64> %a to <16 x i32>
   %c = add <16 x i32> %b, %b
@@ -353,12 +498,16 @@ define void @trunc_v16i64_v16i32(<16 x i64>* %in, <16 x i32>* %out) #0 {
 }

 ; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
-define void @trunc_v32i64_v32i32(<32 x i64>* %in, <32 x i32>* %out) #0 {
+define void @trunc_v32i64_v32i32(<32 x i64>* %in, <32 x i32>* %out) vscale_range(16,0) #0 {
 ; CHECK-LABEL: trunc_v32i64_v32i32:
-; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
-; VBITS_GE_2048: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0]
-; VBITS_GE_2048: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s
-; VBITS_GE_2048: add [[A_WORDS]].s, [[A_WORDS]].s, [[A_WORDS]].s
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl32
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.s, vl32
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: add z0.s, z0.s, z0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
   %a = load <32 x i64>, <32 x i64>* %in
   %b = trunc <32 x i64> %a to <32 x i32>
   %c = add <32 x i32> %b, %b

File diff suppressed because it is too large