forked from OSchip/llvm-project
[llvm][sve] Lowering for VLS extending loads
This patch enables extending loads for fixed-length SVE code generation. There is a slight regression in the mulh tests: because these tests load their parameters and then extend them, the loads are now treated as extending loads and merged, which prevents the mulh instruction from being generated. As this affects scalable SVE codegen as well, it should be addressed in a separate patch. Reviewed By: bsmith. Differential Revision: https://reviews.llvm.org/D107057
This commit is contained in:
parent
39bbbc2c2a
commit
9c47d6b48d
|
@ -1516,6 +1516,8 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
|
||||||
MVT InnerVT = VT.changeVectorElementType(MVT::i8);
|
MVT InnerVT = VT.changeVectorElementType(MVT::i8);
|
||||||
while (InnerVT != VT) {
|
while (InnerVT != VT) {
|
||||||
setTruncStoreAction(VT, InnerVT, Custom);
|
setTruncStoreAction(VT, InnerVT, Custom);
|
||||||
|
setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom);
|
||||||
|
setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom);
|
||||||
InnerVT = InnerVT.changeVectorElementType(
|
InnerVT = InnerVT.changeVectorElementType(
|
||||||
MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
|
MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
|
||||||
}
|
}
|
||||||
|
@ -4176,7 +4178,9 @@ bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT VT) const {
|
||||||
}
|
}
|
||||||
|
|
||||||
bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
|
bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
|
||||||
return ExtVal.getValueType().isScalableVector();
|
return ExtVal.getValueType().isScalableVector() ||
|
||||||
|
useSVEForFixedLengthVectorVT(ExtVal.getValueType(),
|
||||||
|
/*OverrideNEON=*/true);
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
|
unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
|
||||||
|
|
|
@ -0,0 +1,225 @@
|
||||||
|
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
|
||||||
|
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_EQ_256
|
||||||
|
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
|
||||||
|
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
|
||||||
|
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
|
||||||
|
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
|
||||||
|
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
|
||||||
|
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
|
||||||
|
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
|
||||||
|
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
|
||||||
|
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
|
||||||
|
; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
|
||||||
|
; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
|
||||||
|
; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
|
||||||
|
; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
|
||||||
|
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_2048,VBITS_GE_1024,VBITS_GE_512
|
||||||
|
|
||||||
|
target triple = "aarch64-unknown-linux-gnu"
|
||||||
|
|
||||||
|
; Don't use SVE when its registers are no bigger than NEON.
|
||||||
|
; NO_SVE-NOT: ptrue
|
||||||
|
|
||||||
|
define <4 x i32> @load_zext_v4i16i32(<4 x i16>* %ap) #0 {
|
||||||
|
; CHECK-LABEL: load_zext_v4i16i32
|
||||||
|
; CHECK: ldr d[[D0:[0-9]+]], [x0]
|
||||||
|
; CHECK-NEXT: ushll v[[D0]].4s, v[[D0]].4h, #0
|
||||||
|
; CHECK-NEXT: ret
|
||||||
|
%a = load <4 x i16>, <4 x i16>* %ap
|
||||||
|
%val = zext <4 x i16> %a to <4 x i32>
|
||||||
|
ret <4 x i32> %val
|
||||||
|
}
|
||||||
|
|
||||||
|
define <8 x i32> @load_zext_v8i16i32(<8 x i16>* %ap) #0 {
|
||||||
|
; CHECK-LABEL: load_zext_v8i16i32
|
||||||
|
; CHECK: ptrue [[P0:p[0-9]+]].s, vl8
|
||||||
|
; CHECK-NEXT: ld1h { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0]
|
||||||
|
; CHECK-NEXT: st1w { [[Z0]].s }, [[P0]], [x8]
|
||||||
|
; CHECK-NEXT: ret
|
||||||
|
%a = load <8 x i16>, <8 x i16>* %ap
|
||||||
|
%val = zext <8 x i16> %a to <8 x i32>
|
||||||
|
ret <8 x i32> %val
|
||||||
|
}
|
||||||
|
|
||||||
|
define <16 x i32> @load_zext_v16i16i32(<16 x i16>* %ap) #0 {
|
||||||
|
; CHECK-LABEL: load_zext_v16i16i32
|
||||||
|
; VBITS_GE_512: ptrue [[P0:p[0-9]+]].s, vl16
|
||||||
|
; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0]
|
||||||
|
; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[P0]], [x8]
|
||||||
|
; VBITS_GE_512-NEXT: ret
|
||||||
|
|
||||||
|
; Ensure sensible type legalisation
|
||||||
|
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
|
||||||
|
; VBITS_EQ_256-DAG: ld1h { [[Z0:z[0-9]+]].h }, [[PG]]/z, [x0]
|
||||||
|
; VBITS_EQ_256-DAG: mov x9, sp
|
||||||
|
; VBITS_EQ_256-DAG: st1h { [[Z0]].h }, [[PG]], [x9]
|
||||||
|
; VBITS_EQ_256-DAG: ldp q[[R0:[0-9]+]], q[[R1:[0-9]+]], [sp]
|
||||||
|
; VBITS_EQ_256-DAG: add x9, x8, #32
|
||||||
|
; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8
|
||||||
|
; VBITS_EQ_256-DAG: uunpklo z[[R0]].s, z[[R0]].h
|
||||||
|
; VBITS_EQ_256-DAG: uunpklo z[[R1]].s, z[[R1]].h
|
||||||
|
; VBITS_EQ_256-DAG: st1w { z[[R1]].s }, [[PG1]], [x9]
|
||||||
|
; VBITS_EQ_256-DAG: st1w { z[[R0]].s }, [[PG1]], [x8]
|
||||||
|
; VBITS_EQ_256-DAG: ret
|
||||||
|
%a = load <16 x i16>, <16 x i16>* %ap
|
||||||
|
%val = zext <16 x i16> %a to <16 x i32>
|
||||||
|
ret <16 x i32> %val
|
||||||
|
}
|
||||||
|
|
||||||
|
define <32 x i32> @load_zext_v32i16i32(<32 x i16>* %ap) #0 {
|
||||||
|
; CHECK-LABEL: load_zext_v32i16i32
|
||||||
|
; VBITS_GE_1024: ptrue [[P0:p[0-9]+]].s, vl32
|
||||||
|
; VBITS_GE_1024-NEXT: ld1h { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0]
|
||||||
|
; VBITS_GE_1024-NEXT: st1w { [[Z0]].s }, [[P0]], [x8]
|
||||||
|
; VBITS_GE_1024-NEXT: ret
|
||||||
|
%a = load <32 x i16>, <32 x i16>* %ap
|
||||||
|
%val = zext <32 x i16> %a to <32 x i32>
|
||||||
|
ret <32 x i32> %val
|
||||||
|
}
|
||||||
|
|
||||||
|
define <64 x i32> @load_zext_v64i16i32(<64 x i16>* %ap) #0 {
|
||||||
|
; CHECK-LABEL: load_zext_v64i16i32
|
||||||
|
; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].s, vl64
|
||||||
|
; VBITS_GE_2048-NEXT: ld1h { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0]
|
||||||
|
; VBITS_GE_2048-NEXT: st1w { [[Z0]].s }, [[P0]], [x8]
|
||||||
|
; VBITS_GE_2048-NEXT: ret
|
||||||
|
%a = load <64 x i16>, <64 x i16>* %ap
|
||||||
|
%val = zext <64 x i16> %a to <64 x i32>
|
||||||
|
ret <64 x i32> %val
|
||||||
|
}
|
||||||
|
|
||||||
|
define <4 x i32> @load_sext_v4i16i32(<4 x i16>* %ap) #0 {
|
||||||
|
; CHECK-LABEL: load_sext_v4i16i32
|
||||||
|
; CHECK: ldr d[[D0:[0-9]+]], [x0]
|
||||||
|
; CHECK-NEXT: sshll v[[D0]].4s, v[[D0]].4h, #0
|
||||||
|
; CHECK-NEXT: ret
|
||||||
|
%a = load <4 x i16>, <4 x i16>* %ap
|
||||||
|
%val = sext <4 x i16> %a to <4 x i32>
|
||||||
|
ret <4 x i32> %val
|
||||||
|
}
|
||||||
|
|
||||||
|
define <8 x i32> @load_sext_v8i16i32(<8 x i16>* %ap) #0 {
|
||||||
|
; CHECK-LABEL: load_sext_v8i16i32
|
||||||
|
; CHECK: ptrue [[P0:p[0-9]+]].s, vl8
|
||||||
|
; CHECK-NEXT: ld1sh { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0]
|
||||||
|
; CHECK-NEXT: st1w { [[Z0]].s }, [[P0]], [x8]
|
||||||
|
; CHECK-NEXT: ret
|
||||||
|
%a = load <8 x i16>, <8 x i16>* %ap
|
||||||
|
%val = sext <8 x i16> %a to <8 x i32>
|
||||||
|
ret <8 x i32> %val
|
||||||
|
}
|
||||||
|
|
||||||
|
define <16 x i32> @load_sext_v16i16i32(<16 x i16>* %ap) #0 {
|
||||||
|
; CHECK-LABEL: load_sext_v16i16i32
|
||||||
|
; VBITS_GE_512: ptrue [[P0:p[0-9]+]].s, vl16
|
||||||
|
; VBITS_GE_512-NEXT: ld1sh { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0]
|
||||||
|
; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[P0]], [x8]
|
||||||
|
; VBITS_GE_512-NEXT: ret
|
||||||
|
|
||||||
|
; Ensure sensible type legalisation
|
||||||
|
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
|
||||||
|
; VBITS_EQ_256-DAG: ld1h { [[Z0:z[0-9]+]].h }, [[PG]]/z, [x0]
|
||||||
|
; VBITS_EQ_256-DAG: mov x9, sp
|
||||||
|
; VBITS_EQ_256-DAG: st1h { [[Z0]].h }, [[PG]], [x9]
|
||||||
|
; VBITS_EQ_256-DAG: ldp q[[R0:[0-9]+]], q[[R1:[0-9]+]], [sp]
|
||||||
|
; VBITS_EQ_256-DAG: add x9, x8, #32
|
||||||
|
; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8
|
||||||
|
; VBITS_EQ_256-DAG: sunpklo z[[R0]].s, z[[R0]].h
|
||||||
|
; VBITS_EQ_256-DAG: sunpklo z[[R1]].s, z[[R1]].h
|
||||||
|
; VBITS_EQ_256-DAG: st1w { z[[R1]].s }, [[PG1]], [x9]
|
||||||
|
; VBITS_EQ_256-DAG: st1w { z[[R0]].s }, [[PG1]], [x8]
|
||||||
|
; VBITS_EQ_256-DAG: ret
|
||||||
|
%a = load <16 x i16>, <16 x i16>* %ap
|
||||||
|
%val = sext <16 x i16> %a to <16 x i32>
|
||||||
|
ret <16 x i32> %val
|
||||||
|
}
|
||||||
|
|
||||||
|
define <32 x i32> @load_sext_v32i16i32(<32 x i16>* %ap) #0 {
|
||||||
|
; CHECK-LABEL: load_sext_v32i16i32
|
||||||
|
; VBITS_GE_1024: ptrue [[P0:p[0-9]+]].s, vl32
|
||||||
|
; VBITS_GE_1024-NEXT: ld1sh { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0]
|
||||||
|
; VBITS_GE_1024-NEXT: st1w { [[Z0]].s }, [[P0]], [x8]
|
||||||
|
; VBITS_GE_1024-NEXT: ret
|
||||||
|
%a = load <32 x i16>, <32 x i16>* %ap
|
||||||
|
%val = sext <32 x i16> %a to <32 x i32>
|
||||||
|
ret <32 x i32> %val
|
||||||
|
}
|
||||||
|
|
||||||
|
define <64 x i32> @load_sext_v64i16i32(<64 x i16>* %ap) #0 {
|
||||||
|
; CHECK-LABEL: load_sext_v64i16i32
|
||||||
|
; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].s, vl64
|
||||||
|
; VBITS_GE_2048-NEXT: ld1sh { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0]
|
||||||
|
; VBITS_GE_2048-NEXT: st1w { [[Z0]].s }, [[P0]], [x8]
|
||||||
|
; VBITS_GE_2048-NEXT: ret
|
||||||
|
%a = load <64 x i16>, <64 x i16>* %ap
|
||||||
|
%val = sext <64 x i16> %a to <64 x i32>
|
||||||
|
ret <64 x i32> %val
|
||||||
|
}
|
||||||
|
|
||||||
|
define <32 x i64> @load_zext_v32i8i64(<32 x i8>* %ap) #0 {
|
||||||
|
; CHECK-LABEL: load_zext_v32i8i64
|
||||||
|
; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32
|
||||||
|
; VBITS_GE_2048-NEXT: ld1b { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0]
|
||||||
|
; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8]
|
||||||
|
; VBITS_GE_2048-NEXT: ret
|
||||||
|
%a = load <32 x i8>, <32 x i8>* %ap
|
||||||
|
%val = zext <32 x i8> %a to <32 x i64>
|
||||||
|
ret <32 x i64> %val
|
||||||
|
}
|
||||||
|
|
||||||
|
define <32 x i64> @load_sext_v32i8i64(<32 x i8>* %ap) #0 {
|
||||||
|
; CHECK-LABEL: load_sext_v32i8i64
|
||||||
|
; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32
|
||||||
|
; VBITS_GE_2048-NEXT: ld1sb { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0]
|
||||||
|
; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8]
|
||||||
|
; VBITS_GE_2048-NEXT: ret
|
||||||
|
%a = load <32 x i8>, <32 x i8>* %ap
|
||||||
|
%val = sext <32 x i8> %a to <32 x i64>
|
||||||
|
ret <32 x i64> %val
|
||||||
|
}
|
||||||
|
|
||||||
|
define <32 x i64> @load_zext_v32i16i64(<32 x i16>* %ap) #0 {
|
||||||
|
; CHECK-LABEL: load_zext_v32i16i64
|
||||||
|
; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32
|
||||||
|
; VBITS_GE_2048-NEXT: ld1h { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0]
|
||||||
|
; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8]
|
||||||
|
; VBITS_GE_2048-NEXT: ret
|
||||||
|
%a = load <32 x i16>, <32 x i16>* %ap
|
||||||
|
%val = zext <32 x i16> %a to <32 x i64>
|
||||||
|
ret <32 x i64> %val
|
||||||
|
}
|
||||||
|
|
||||||
|
define <32 x i64> @load_sext_v32i16i64(<32 x i16>* %ap) #0 {
|
||||||
|
; CHECK-LABEL: load_sext_v32i16i64
|
||||||
|
; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32
|
||||||
|
; VBITS_GE_2048-NEXT: ld1sh { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0]
|
||||||
|
; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8]
|
||||||
|
; VBITS_GE_2048-NEXT: ret
|
||||||
|
%a = load <32 x i16>, <32 x i16>* %ap
|
||||||
|
%val = sext <32 x i16> %a to <32 x i64>
|
||||||
|
ret <32 x i64> %val
|
||||||
|
}
|
||||||
|
|
||||||
|
define <32 x i64> @load_zext_v32i32i64(<32 x i32>* %ap) #0 {
|
||||||
|
; CHECK-LABEL: load_zext_v32i32i64
|
||||||
|
; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32
|
||||||
|
; VBITS_GE_2048-NEXT: ld1w { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0]
|
||||||
|
; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8]
|
||||||
|
; VBITS_GE_2048-NEXT: ret
|
||||||
|
%a = load <32 x i32>, <32 x i32>* %ap
|
||||||
|
%val = zext <32 x i32> %a to <32 x i64>
|
||||||
|
ret <32 x i64> %val
|
||||||
|
}
|
||||||
|
|
||||||
|
define <32 x i64> @load_sext_v32i32i64(<32 x i32>* %ap) #0 {
|
||||||
|
; CHECK-LABEL: load_sext_v32i32i64
|
||||||
|
; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32
|
||||||
|
; VBITS_GE_2048-NEXT: ld1sw { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0]
|
||||||
|
; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8]
|
||||||
|
; VBITS_GE_2048-NEXT: ret
|
||||||
|
%a = load <32 x i32>, <32 x i32>* %ap
|
||||||
|
%val = sext <32 x i32> %a to <32 x i64>
|
||||||
|
ret <32 x i64> %val
|
||||||
|
}
|
||||||
|
|
||||||
|
attributes #0 = { "target-features"="+sve" }
|
|
@ -1,11 +1,11 @@
|
||||||
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
|
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
|
||||||
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256
|
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256,VBITS_EQ_256
|
||||||
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256
|
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256
|
||||||
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256
|
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256,VBITS_EQ_512
|
||||||
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256
|
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256
|
||||||
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256
|
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256
|
||||||
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256
|
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256
|
||||||
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
|
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256,VBITS_EQ_1024
|
||||||
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
|
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
|
||||||
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
|
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
|
||||||
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
|
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
|
||||||
|
@ -75,12 +75,19 @@ define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
|
||||||
|
|
||||||
define void @smulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
|
define void @smulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
|
||||||
; CHECK-LABEL: smulh_v32i8:
|
; CHECK-LABEL: smulh_v32i8:
|
||||||
; VBITS_GE_256: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
|
; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
|
||||||
; VBITS_GE_256-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
|
; VBITS_EQ_256-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
|
||||||
; VBITS_GE_256-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
|
; VBITS_EQ_256-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
|
||||||
; VBITS_GE_256: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
|
; VBITS_EQ_256: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
|
||||||
; VBITS_GE_256: st1b { [[RES]].b }, [[PG]], [x0]
|
; VBITS_EQ_256: ret
|
||||||
; VBITS_GE_256: ret
|
|
||||||
|
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,32)]]
|
||||||
|
; VBITS_GE_512-DAG: ld1sb { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
|
||||||
|
; VBITS_GE_512-DAG: ld1sb { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
|
||||||
|
; VBITS_GE_512: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
|
||||||
|
; VBITS_GE_512: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8
|
||||||
|
; VBITS_GE_512: st1b { [[RES]].h }, [[PG]], [x0]
|
||||||
|
; VBITS_GE_512: ret
|
||||||
%op1 = load <32 x i8>, <32 x i8>* %a
|
%op1 = load <32 x i8>, <32 x i8>* %a
|
||||||
%op2 = load <32 x i8>, <32 x i8>* %b
|
%op2 = load <32 x i8>, <32 x i8>* %b
|
||||||
%insert = insertelement <32 x i16> undef, i16 8, i64 0
|
%insert = insertelement <32 x i16> undef, i16 8, i64 0
|
||||||
|
@ -96,12 +103,19 @@ define void @smulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
|
||||||
|
|
||||||
define void @smulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
|
define void @smulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
|
||||||
; CHECK-LABEL: smulh_v64i8:
|
; CHECK-LABEL: smulh_v64i8:
|
||||||
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
|
; VBITS_EQ_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
|
||||||
; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
|
; VBITS_EQ_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
|
||||||
; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
|
; VBITS_EQ_512: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
|
||||||
; VBITS_GE_512: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
|
; VBITS_EQ_512: st1b { [[RES]].b }, [[PG]], [x0]
|
||||||
; VBITS_GE_512: st1b { [[RES]].b }, [[PG]], [x0]
|
; VBITS_EQ_512: ret
|
||||||
; VBITS_GE_512: ret
|
|
||||||
|
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,64)]]
|
||||||
|
; VBITS_GE_1024-DAG: ld1sb { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
|
||||||
|
; VBITS_GE_1024-DAG: ld1sb { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
|
||||||
|
; VBITS_GE_1024: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
|
||||||
|
; VBITS_GE_1024: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8
|
||||||
|
; VBITS_GE_1024: st1b { [[RES]].h }, [[PG]], [x0]
|
||||||
|
; VBITS_GE_1024: ret
|
||||||
%op1 = load <64 x i8>, <64 x i8>* %a
|
%op1 = load <64 x i8>, <64 x i8>* %a
|
||||||
%op2 = load <64 x i8>, <64 x i8>* %b
|
%op2 = load <64 x i8>, <64 x i8>* %b
|
||||||
%insert = insertelement <64 x i16> undef, i16 8, i64 0
|
%insert = insertelement <64 x i16> undef, i16 8, i64 0
|
||||||
|
@ -117,12 +131,19 @@ define void @smulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
|
||||||
|
|
||||||
define void @smulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
|
define void @smulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
|
||||||
; CHECK-LABEL: smulh_v128i8:
|
; CHECK-LABEL: smulh_v128i8:
|
||||||
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
|
; VBITS_EQ_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
|
||||||
; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
|
; VBITS_EQ_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
|
||||||
; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
|
; VBITS_EQ_1024: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
|
||||||
; VBITS_GE_1024: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
|
; VBITS_EQ_1024: st1b { [[RES]].b }, [[PG]], [x0]
|
||||||
; VBITS_GE_1024: st1b { [[RES]].b }, [[PG]], [x0]
|
; VBITS_EQ_1024: ret
|
||||||
; VBITS_GE_1024: ret
|
|
||||||
|
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,128)]]
|
||||||
|
; VBITS_GE_2048-DAG: ld1sb { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
|
||||||
|
; VBITS_GE_2048-DAG: ld1sb { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
|
||||||
|
; VBITS_GE_2048: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
|
||||||
|
; VBITS_GE_2048: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8
|
||||||
|
; VBITS_GE_2048: st1b { [[RES]].h }, [[PG]], [x0]
|
||||||
|
; VBITS_GE_2048: ret
|
||||||
%op1 = load <128 x i8>, <128 x i8>* %a
|
%op1 = load <128 x i8>, <128 x i8>* %a
|
||||||
%op2 = load <128 x i8>, <128 x i8>* %b
|
%op2 = load <128 x i8>, <128 x i8>* %b
|
||||||
%insert = insertelement <128 x i16> undef, i16 8, i64 0
|
%insert = insertelement <128 x i16> undef, i16 8, i64 0
|
||||||
|
@ -198,12 +219,20 @@ define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
|
||||||
|
|
||||||
define void @smulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
|
define void @smulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
|
||||||
; CHECK-LABEL: smulh_v16i16:
|
; CHECK-LABEL: smulh_v16i16:
|
||||||
; VBITS_GE_256: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,16)]]
|
; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,16)]]
|
||||||
; VBITS_GE_256-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
|
; VBITS_EQ_256-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
|
||||||
; VBITS_GE_256-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
|
; VBITS_EQ_256-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
|
||||||
; VBITS_GE_256: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
|
; VBITS_EQ_256: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
|
||||||
; VBITS_GE_256: st1h { [[RES]].h }, [[PG]], [x0]
|
; VBITS_EQ_256: st1h { [[RES]].h }, [[PG]], [x0]
|
||||||
; VBITS_GE_256: ret
|
; VBITS_EQ_256: ret
|
||||||
|
|
||||||
|
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,16)]]
|
||||||
|
; VBITS_GE_512-DAG: ld1sh { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
|
||||||
|
; VBITS_GE_512-DAG: ld1sh { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
|
||||||
|
; VBITS_GE_512: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
|
||||||
|
; VBITS_GE_512: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16
|
||||||
|
; VBITS_GE_512: st1h { [[RES]].s }, [[PG]], [x0]
|
||||||
|
; VBITS_GE_512: ret
|
||||||
%op1 = load <16 x i16>, <16 x i16>* %a
|
%op1 = load <16 x i16>, <16 x i16>* %a
|
||||||
%op2 = load <16 x i16>, <16 x i16>* %b
|
%op2 = load <16 x i16>, <16 x i16>* %b
|
||||||
%insert = insertelement <16 x i32> undef, i32 16, i64 0
|
%insert = insertelement <16 x i32> undef, i32 16, i64 0
|
||||||
|
@ -219,12 +248,20 @@ define void @smulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
|
||||||
|
|
||||||
define void @smulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
|
define void @smulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
|
||||||
; CHECK-LABEL: smulh_v32i16:
|
; CHECK-LABEL: smulh_v32i16:
|
||||||
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,32)]]
|
; VBITS_EQ_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,32)]]
|
||||||
; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
|
; VBITS_EQ_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
|
||||||
; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
|
; VBITS_EQ_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
|
||||||
; VBITS_GE_512: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
|
; VBITS_EQ_512: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
|
||||||
; VBITS_GE_512: st1h { [[RES]].h }, [[PG]], [x0]
|
; VBITS_EQ_512: st1h { [[RES]].h }, [[PG]], [x0]
|
||||||
; VBITS_GE_512: ret
|
; VBITS_EQ_512: ret
|
||||||
|
|
||||||
|
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,32)]]
|
||||||
|
; VBITS_GE_1024-DAG: ld1sh { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
|
||||||
|
; VBITS_GE_1024-DAG: ld1sh { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
|
||||||
|
; VBITS_GE_1024: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
|
||||||
|
; VBITS_GE_1024: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16
|
||||||
|
; VBITS_GE_1024: st1h { [[RES]].s }, [[PG]], [x0]
|
||||||
|
; VBITS_GE_1024: ret
|
||||||
%op1 = load <32 x i16>, <32 x i16>* %a
|
%op1 = load <32 x i16>, <32 x i16>* %a
|
||||||
%op2 = load <32 x i16>, <32 x i16>* %b
|
%op2 = load <32 x i16>, <32 x i16>* %b
|
||||||
%insert = insertelement <32 x i32> undef, i32 16, i64 0
|
%insert = insertelement <32 x i32> undef, i32 16, i64 0
|
||||||
|
@ -240,12 +277,20 @@ define void @smulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
|
||||||
|
|
||||||
define void @smulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
|
define void @smulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
|
||||||
; CHECK-LABEL: smulh_v64i16:
|
; CHECK-LABEL: smulh_v64i16:
|
||||||
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,64)]]
|
; VBITS_EQ_1024: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,64)]]
|
||||||
; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
|
; VBITS_EQ_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
|
||||||
; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
|
; VBITS_EQ_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
|
||||||
; VBITS_GE_1024: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
|
; VBITS_EQ_1024: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
|
||||||
; VBITS_GE_1024: st1h { [[RES]].h }, [[PG]], [x0]
|
; VBITS_EQ_1024: st1h { [[RES]].h }, [[PG]], [x0]
|
||||||
; VBITS_GE_1024: ret
|
; VBITS_EQ_1024: ret
|
||||||
|
|
||||||
|
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,64)]]
|
||||||
|
; VBITS_GE_2048-DAG: ld1sh { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
|
||||||
|
; VBITS_GE_2048-DAG: ld1sh { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
|
||||||
|
; VBITS_GE_2048: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
|
||||||
|
; VBITS_GE_2048: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16
|
||||||
|
; VBITS_GE_2048: st1h { [[RES]].s }, [[PG]], [x0]
|
||||||
|
; VBITS_GE_2048: ret
|
||||||
%op1 = load <64 x i16>, <64 x i16>* %a
|
%op1 = load <64 x i16>, <64 x i16>* %a
|
||||||
%op2 = load <64 x i16>, <64 x i16>* %b
|
%op2 = load <64 x i16>, <64 x i16>* %b
|
||||||
%insert = insertelement <64 x i32> undef, i32 16, i64 0
|
%insert = insertelement <64 x i32> undef, i32 16, i64 0
|
||||||
|
@ -318,12 +363,20 @@ define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
|
||||||
|
|
||||||
define void @smulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
|
define void @smulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
|
||||||
; CHECK-LABEL: smulh_v8i32:
|
; CHECK-LABEL: smulh_v8i32:
|
||||||
; VBITS_GE_256: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,8)]]
|
; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,8)]]
|
||||||
; VBITS_GE_256-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
|
; VBITS_EQ_256-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
|
||||||
; VBITS_GE_256-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
|
; VBITS_EQ_256-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
|
||||||
; VBITS_GE_256: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
|
; VBITS_EQ_256: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
|
||||||
; VBITS_GE_256: st1w { [[RES]].s }, [[PG]], [x0]
|
; VBITS_EQ_256: st1w { [[RES]].s }, [[PG]], [x0]
|
||||||
; VBITS_GE_256: ret
|
; VBITS_EQ_256: ret
|
||||||
|
|
||||||
|
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,8)]]
|
||||||
|
; VBITS_GE_512-DAG: ld1sw { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
|
||||||
|
; VBITS_GE_512-DAG: ld1sw { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
|
||||||
|
; VBITS_GE_512: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
|
||||||
|
; VBITS_GE_512: lsr [[RES]].d, [[PG]]/m, [[RES]].d, #32
|
||||||
|
; VBITS_GE_512: st1w { [[RES]].d }, [[PG]], [x0]
|
||||||
|
; VBITS_GE_512: ret
|
||||||
%op1 = load <8 x i32>, <8 x i32>* %a
|
%op1 = load <8 x i32>, <8 x i32>* %a
|
||||||
%op2 = load <8 x i32>, <8 x i32>* %b
|
%op2 = load <8 x i32>, <8 x i32>* %b
|
||||||
%insert = insertelement <8 x i64> undef, i64 32, i64 0
|
%insert = insertelement <8 x i64> undef, i64 32, i64 0
|
||||||
|
@ -339,12 +392,19 @@ define void @smulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
|
||||||
|
|
||||||
define void @smulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
|
define void @smulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
|
||||||
; CHECK-LABEL: smulh_v16i32:
|
; CHECK-LABEL: smulh_v16i32:
|
||||||
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,16)]]
|
; VBITS_EQ_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,16)]]
|
||||||
; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
|
; VBITS_EQ_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
|
||||||
; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
|
; VBITS_EQ_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
|
||||||
; VBITS_GE_512: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
|
; VBITS_EQ_512: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
|
||||||
; VBITS_GE_512: st1w { [[RES]].s }, [[PG]], [x0]
|
; VBITS_EQ_512: st1w { [[RES]].s }, [[PG]], [x0]
|
||||||
; VBITS_GE_512: ret
|
; VBITS_EQ_512: ret
|
||||||
|
|
||||||
|
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,16)]]
|
||||||
|
; VBITS_GE_1024-DAG: ld1sw { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
|
||||||
|
; VBITS_GE_1024-DAG: ld1sw { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
|
||||||
|
; VBITS_GE_1024: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
|
||||||
|
; VBITS_GE_1024: st1w { [[RES]].d }, [[PG]], [x0]
|
||||||
|
; VBITS_GE_1024: ret
|
||||||
%op1 = load <16 x i32>, <16 x i32>* %a
|
%op1 = load <16 x i32>, <16 x i32>* %a
|
||||||
%op2 = load <16 x i32>, <16 x i32>* %b
|
%op2 = load <16 x i32>, <16 x i32>* %b
|
||||||
%insert = insertelement <16 x i64> undef, i64 32, i64 0
|
%insert = insertelement <16 x i64> undef, i64 32, i64 0
|
||||||
|
@ -360,12 +420,20 @@ define void @smulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
|
||||||
|
|
||||||
define void @smulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
|
define void @smulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
|
||||||
; CHECK-LABEL: smulh_v32i32:
|
; CHECK-LABEL: smulh_v32i32:
|
||||||
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,32)]]
|
; VBITS_EQ_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,32)]]
|
||||||
; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
|
; VBITS_EQ_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
|
||||||
; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
|
; VBITS_EQ_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
|
||||||
; VBITS_GE_1024: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
|
; VBITS_EQ_1024: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
|
||||||
; VBITS_GE_1024: st1w { [[RES]].s }, [[PG]], [x0]
|
; VBITS_EQ_1024: st1w { [[RES]].s }, [[PG]], [x0]
|
||||||
; VBITS_GE_1024: ret
|
; VBITS_EQ_1024: ret
|
||||||
|
|
||||||
|
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,32)]]
|
||||||
|
; VBITS_GE_2048-DAG: ld1sw { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
|
||||||
|
; VBITS_GE_2048-DAG: ld1sw { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
|
||||||
|
; VBITS_GE_2048: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
|
||||||
|
; VBITS_GE_2048: lsr [[RES]].d, [[PG]]/m, [[RES]].d, #32
|
||||||
|
; VBITS_GE_2048: st1w { [[RES]].d }, [[PG]], [x0]
|
||||||
|
; VBITS_GE_2048: ret
|
||||||
%op1 = load <32 x i32>, <32 x i32>* %a
|
%op1 = load <32 x i32>, <32 x i32>* %a
|
||||||
%op2 = load <32 x i32>, <32 x i32>* %b
|
%op2 = load <32 x i32>, <32 x i32>* %b
|
||||||
%insert = insertelement <32 x i64> undef, i64 32, i64 0
|
%insert = insertelement <32 x i64> undef, i64 32, i64 0
|
||||||
|
@ -563,12 +631,20 @@ define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
|
||||||
|
|
||||||
define void @umulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
|
define void @umulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
|
||||||
; CHECK-LABEL: umulh_v32i8:
|
; CHECK-LABEL: umulh_v32i8:
|
||||||
; VBITS_GE_256: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
|
; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
|
||||||
; VBITS_GE_256-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
|
; VBITS_EQ_256-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
|
||||||
; VBITS_GE_256-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
|
; VBITS_EQ_256-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
|
||||||
; VBITS_GE_256: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
|
; VBITS_EQ_256: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
|
||||||
; VBITS_GE_256: st1b { [[RES]].b }, [[PG]], [x0]
|
; VBITS_EQ_256: st1b { [[RES]].b }, [[PG]], [x0]
|
||||||
; VBITS_GE_256: ret
|
; VBITS_EQ_256: ret
|
||||||
|
|
||||||
|
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,32)]]
|
||||||
|
; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
|
||||||
|
; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
|
||||||
|
; VBITS_GE_512: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
|
||||||
|
; VBITS_GE_512: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8
|
||||||
|
; VBITS_GE_512: st1b { [[RES]].h }, [[PG]], [x0]
|
||||||
|
; VBITS_GE_512: ret
|
||||||
%op1 = load <32 x i8>, <32 x i8>* %a
|
%op1 = load <32 x i8>, <32 x i8>* %a
|
||||||
%op2 = load <32 x i8>, <32 x i8>* %b
|
%op2 = load <32 x i8>, <32 x i8>* %b
|
||||||
%insert = insertelement <32 x i16> undef, i16 8, i64 0
|
%insert = insertelement <32 x i16> undef, i16 8, i64 0
|
||||||
|
@ -584,12 +660,18 @@ define void @umulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
|
||||||
|
|
||||||
define void @umulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
|
define void @umulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
|
||||||
; CHECK-LABEL: umulh_v64i8:
|
; CHECK-LABEL: umulh_v64i8:
|
||||||
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
|
; VBITS_EQ_512: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
|
||||||
; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
|
; VBITS_EQ_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
|
||||||
; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
|
; VBITS_EQ_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
|
||||||
; VBITS_GE_512: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
|
; VBITS_EQ_512: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
|
||||||
; VBITS_GE_512: st1b { [[RES]].b }, [[PG]], [x0]
|
; VBITS_EQ_512: ret
|
||||||
; VBITS_GE_512: ret
|
|
||||||
|
; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
|
||||||
|
; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
|
||||||
|
; VBITS_GE_1024: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
|
||||||
|
; VBITS_GE_1024: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8
|
||||||
|
; VBITS_GE_1024: st1b { [[RES]].h }, [[PG]], [x0]
|
||||||
|
; VBITS_GE_1024: ret
|
||||||
%op1 = load <64 x i8>, <64 x i8>* %a
|
%op1 = load <64 x i8>, <64 x i8>* %a
|
||||||
%op2 = load <64 x i8>, <64 x i8>* %b
|
%op2 = load <64 x i8>, <64 x i8>* %b
|
||||||
%insert = insertelement <64 x i16> undef, i16 8, i64 0
|
%insert = insertelement <64 x i16> undef, i16 8, i64 0
|
||||||
|
@ -605,12 +687,20 @@ define void @umulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
|
||||||
|
|
||||||
define void @umulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
|
define void @umulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
|
||||||
; CHECK-LABEL: umulh_v128i8:
|
; CHECK-LABEL: umulh_v128i8:
|
||||||
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
|
; VBITS_EQ_1024: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
|
||||||
; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
|
; VBITS_EQ_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
|
||||||
; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
|
; VBITS_EQ_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
|
||||||
; VBITS_GE_1024: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
|
; VBITS_EQ_1024: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
|
||||||
; VBITS_GE_1024: st1b { [[RES]].b }, [[PG]], [x0]
|
; VBITS_EQ_1024: st1b { [[RES]].b }, [[PG]], [x0]
|
||||||
; VBITS_GE_1024: ret
|
; VBITS_EQ_1024: ret
|
||||||
|
|
||||||
|
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,128)]]
|
||||||
|
; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
|
||||||
|
; VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
|
||||||
|
; VBITS_GE_2048: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
|
||||||
|
; VBITS_GE_2048: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8
|
||||||
|
; VBITS_GE_2048: st1b { [[RES]].h }, [[PG]], [x0]
|
||||||
|
; VBITS_GE_2048: ret
|
||||||
%op1 = load <128 x i8>, <128 x i8>* %a
|
%op1 = load <128 x i8>, <128 x i8>* %a
|
||||||
%op2 = load <128 x i8>, <128 x i8>* %b
|
%op2 = load <128 x i8>, <128 x i8>* %b
|
||||||
%insert = insertelement <128 x i16> undef, i16 8, i64 0
|
%insert = insertelement <128 x i16> undef, i16 8, i64 0
|
||||||
|
@ -686,12 +776,20 @@ define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
|
||||||
|
|
||||||
define void @umulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
|
define void @umulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
|
||||||
; CHECK-LABEL: umulh_v16i16:
|
; CHECK-LABEL: umulh_v16i16:
|
||||||
; VBITS_GE_256: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,16)]]
|
; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,16)]]
|
||||||
; VBITS_GE_256-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
|
; VBITS_EQ_256-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
|
||||||
; VBITS_GE_256-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
|
; VBITS_EQ_256-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
|
||||||
; VBITS_GE_256: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
|
; VBITS_EQ_256: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
|
||||||
; VBITS_GE_256: st1h { [[RES]].h }, [[PG]], [x0]
|
; VBITS_EQ_256: st1h { [[RES]].h }, [[PG]], [x0]
|
||||||
; VBITS_GE_256: ret
|
; VBITS_EQ_256: ret
|
||||||
|
|
||||||
|
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,16)]]
|
||||||
|
; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
|
||||||
|
; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
|
||||||
|
; VBITS_GE_512: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
|
||||||
|
; VBITS_GE_512: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16
|
||||||
|
; VBITS_GE_512: st1h { [[RES]].s }, [[PG]], [x0]
|
||||||
|
; VBITS_GE_512: ret
|
||||||
%op1 = load <16 x i16>, <16 x i16>* %a
|
%op1 = load <16 x i16>, <16 x i16>* %a
|
||||||
%op2 = load <16 x i16>, <16 x i16>* %b
|
%op2 = load <16 x i16>, <16 x i16>* %b
|
||||||
%insert = insertelement <16 x i32> undef, i32 16, i64 0
|
%insert = insertelement <16 x i32> undef, i32 16, i64 0
|
||||||
|
@ -707,12 +805,20 @@ define void @umulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
|
||||||
|
|
||||||
define void @umulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
|
define void @umulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
|
||||||
; CHECK-LABEL: umulh_v32i16:
|
; CHECK-LABEL: umulh_v32i16:
|
||||||
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,32)]]
|
; VBITS_EQ_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,32)]]
|
||||||
; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
|
; VBITS_EQ_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
|
||||||
; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
|
; VBITS_EQ_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
|
||||||
; VBITS_GE_512: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
|
; VBITS_EQ_512: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
|
||||||
; VBITS_GE_512: st1h { [[RES]].h }, [[PG]], [x0]
|
; VBITS_EQ_512: st1h { [[RES]].h }, [[PG]], [x0]
|
||||||
; VBITS_GE_512: ret
|
; VBITS_EQ_512: ret
|
||||||
|
|
||||||
|
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,32)]]
|
||||||
|
; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
|
||||||
|
; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
|
||||||
|
; VBITS_GE_1024: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
|
||||||
|
; VBITS_GE_1024: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16
|
||||||
|
; VBITS_GE_1024: st1h { [[RES]].s }, [[PG]], [x0]
|
||||||
|
; VBITS_GE_1024: ret
|
||||||
%op1 = load <32 x i16>, <32 x i16>* %a
|
%op1 = load <32 x i16>, <32 x i16>* %a
|
||||||
%op2 = load <32 x i16>, <32 x i16>* %b
|
%op2 = load <32 x i16>, <32 x i16>* %b
|
||||||
%insert = insertelement <32 x i32> undef, i32 16, i64 0
|
%insert = insertelement <32 x i32> undef, i32 16, i64 0
|
||||||
|
@ -728,12 +834,19 @@ define void @umulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
|
||||||
|
|
||||||
define void @umulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
|
define void @umulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
|
||||||
; CHECK-LABEL: umulh_v64i16:
|
; CHECK-LABEL: umulh_v64i16:
|
||||||
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,64)]]
|
; VBITS_EQ_1024: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,64)]]
|
||||||
; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
|
; VBITS_EQ_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
|
||||||
; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
|
; VBITS_EQ_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
|
||||||
; VBITS_GE_1024: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
|
; VBITS_EQ_1024: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
|
||||||
; VBITS_GE_1024: st1h { [[RES]].h }, [[PG]], [x0]
|
; VBITS_EQ_1024: ret
|
||||||
; VBITS_GE_1024: ret
|
|
||||||
|
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,64)]]
|
||||||
|
; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
|
||||||
|
; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
|
||||||
|
; VBITS_GE_2048: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
|
||||||
|
; VBITS_GE_2048: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16
|
||||||
|
; VBITS_GE_2048: st1h { [[RES]].s }, [[PG]], [x0]
|
||||||
|
; VBITS_GE_2048: ret
|
||||||
%op1 = load <64 x i16>, <64 x i16>* %a
|
%op1 = load <64 x i16>, <64 x i16>* %a
|
||||||
%op2 = load <64 x i16>, <64 x i16>* %b
|
%op2 = load <64 x i16>, <64 x i16>* %b
|
||||||
%insert = insertelement <64 x i32> undef, i32 16, i64 0
|
%insert = insertelement <64 x i32> undef, i32 16, i64 0
|
||||||
|
@ -806,12 +919,20 @@ define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
|
||||||
|
|
||||||
define void @umulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
|
define void @umulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
|
||||||
; CHECK-LABEL: umulh_v8i32:
|
; CHECK-LABEL: umulh_v8i32:
|
||||||
; VBITS_GE_256: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,8)]]
|
; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,8)]]
|
||||||
; VBITS_GE_256-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
|
; VBITS_EQ_256-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
|
||||||
; VBITS_GE_256-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
|
; VBITS_EQ_256-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
|
||||||
; VBITS_GE_256: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
|
; VBITS_EQ_256: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
|
||||||
; VBITS_GE_256: st1w { [[RES]].s }, [[PG]], [x0]
|
; VBITS_EQ_256: st1w { [[RES]].s }, [[PG]], [x0]
|
||||||
; VBITS_GE_256: ret
|
; VBITS_EQ_256: ret
|
||||||
|
|
||||||
|
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,8)]]
|
||||||
|
; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
|
||||||
|
; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
|
||||||
|
; VBITS_GE_512: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
|
||||||
|
; VBITS_GE_512: lsr [[RES]].d, [[PG]]/m, [[RES]].d, #32
|
||||||
|
; VBITS_GE_512: st1w { [[RES]].d }, [[PG]], [x0]
|
||||||
|
; VBITS_GE_512: ret
|
||||||
%op1 = load <8 x i32>, <8 x i32>* %a
|
%op1 = load <8 x i32>, <8 x i32>* %a
|
||||||
%op2 = load <8 x i32>, <8 x i32>* %b
|
%op2 = load <8 x i32>, <8 x i32>* %b
|
||||||
%insert = insertelement <8 x i64> undef, i64 32, i64 0
|
%insert = insertelement <8 x i64> undef, i64 32, i64 0
|
||||||
|
@ -827,12 +948,19 @@ define void @umulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
|
||||||
|
|
||||||
define void @umulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
|
define void @umulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
|
||||||
; CHECK-LABEL: umulh_v16i32:
|
; CHECK-LABEL: umulh_v16i32:
|
||||||
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,16)]]
|
; VBITS_EQ_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,16)]]
|
||||||
; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
|
; VBITS_EQ_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
|
||||||
; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
|
; VBITS_EQ_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
|
||||||
; VBITS_GE_512: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
|
; VBITS_EQ_512: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
|
||||||
; VBITS_GE_512: st1w { [[RES]].s }, [[PG]], [x0]
|
; VBITS_EQ_512: st1w { [[RES]].s }, [[PG]], [x0]
|
||||||
; VBITS_GE_512: ret
|
|
||||||
|
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,16)]]
|
||||||
|
; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
|
||||||
|
; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
|
||||||
|
; VBITS_GE_1024: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
|
||||||
|
; VBITS_GE_1024: lsr [[RES]].d, [[PG]]/m, [[RES]].d, #32
|
||||||
|
; VBITS_GE_1024: st1w { [[RES]].d }, [[PG]], [x0]
|
||||||
|
; VBITS_GE_1024: ret
|
||||||
%op1 = load <16 x i32>, <16 x i32>* %a
|
%op1 = load <16 x i32>, <16 x i32>* %a
|
||||||
%op2 = load <16 x i32>, <16 x i32>* %b
|
%op2 = load <16 x i32>, <16 x i32>* %b
|
||||||
%insert = insertelement <16 x i64> undef, i64 32, i64 0
|
%insert = insertelement <16 x i64> undef, i64 32, i64 0
|
||||||
|
@ -848,12 +976,20 @@ define void @umulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
|
||||||
|
|
||||||
define void @umulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
|
define void @umulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
|
||||||
; CHECK-LABEL: umulh_v32i32:
|
; CHECK-LABEL: umulh_v32i32:
|
||||||
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,32)]]
|
; VBITS_EQ_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,32)]]
|
||||||
; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
|
; VBITS_EQ_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
|
||||||
; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
|
; VBITS_EQ_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
|
||||||
; VBITS_GE_1024: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
|
; VBITS_EQ_1024: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
|
||||||
; VBITS_GE_1024: st1w { [[RES]].s }, [[PG]], [x0]
|
; VBITS_EQ_1024: st1w { [[RES]].s }, [[PG]], [x0]
|
||||||
; VBITS_GE_1024: ret
|
; VBITS_EQ_1024: ret
|
||||||
|
|
||||||
|
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,32)]]
|
||||||
|
; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
|
||||||
|
; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
|
||||||
|
; VBITS_GE_2048: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
|
||||||
|
; VBITS_GE_2048: lsr [[RES]].d, [[PG]]/m, [[RES]].d, #32
|
||||||
|
; VBITS_GE_2048: st1w { [[RES]].d }, [[PG]], [x0]
|
||||||
|
; VBITS_GE_2048: ret
|
||||||
%op1 = load <32 x i32>, <32 x i32>* %a
|
%op1 = load <32 x i32>, <32 x i32>* %a
|
||||||
%op2 = load <32 x i32>, <32 x i32>* %b
|
%op2 = load <32 x i32>, <32 x i32>* %b
|
||||||
%insert = insertelement <32 x i64> undef, i64 32, i64 0
|
%insert = insertelement <32 x i64> undef, i64 32, i64 0
|
||||||
|
|
Loading…
Reference in New Issue