[llvm][sve] Lowering for VLS extending loads

This patch enables extending loads for fixed length SVE code generation.

There is a slight regression here in the mulh tests: since these tests
load the parameters and then extend them, the loads are treated as
extending loads and merged, which prevents the mulh instruction from
being generated. As this affects scalable SVE codegen as well, this
should be addressed in a separate patch.

Reviewed By: bsmith

Differential Revision: https://reviews.llvm.org/D107057
This commit is contained in:
David Truby 2021-08-11 14:59:00 +00:00
parent 39bbbc2c2a
commit 9c47d6b48d
3 changed files with 477 additions and 112 deletions

View File

@ -1516,6 +1516,8 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
MVT InnerVT = VT.changeVectorElementType(MVT::i8);
while (InnerVT != VT) {
setTruncStoreAction(VT, InnerVT, Custom);
setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom);
setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom);
InnerVT = InnerVT.changeVectorElementType(
MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
}
@ -4176,7 +4178,9 @@ bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT VT) const {
}
bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
return ExtVal.getValueType().isScalableVector();
return ExtVal.getValueType().isScalableVector() ||
useSVEForFixedLengthVectorVT(ExtVal.getValueType(),
/*OverrideNEON=*/true);
}
unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {

View File

@ -0,0 +1,225 @@
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_EQ_256
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_2048,VBITS_GE_1024,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: ptrue
; Zero-extending load of <4 x i16> to <4 x i32>.
; The expected code is a 64-bit NEON register load followed by ushll, then
; ret; no SVE predicate or ld1h appears in the expected sequence.
define <4 x i32> @load_zext_v4i16i32(<4 x i16>* %ap) #0 {
; CHECK-LABEL: load_zext_v4i16i32
; CHECK: ldr d[[D0:[0-9]+]], [x0]
; CHECK-NEXT: ushll v[[D0]].4s, v[[D0]].4h, #0
; CHECK-NEXT: ret
%a = load <4 x i16>, <4 x i16>* %ap
%val = zext <4 x i16> %a to <4 x i32>
ret <4 x i32> %val
}
; Zero-extending load of <8 x i16> to <8 x i32>.
; Every configuration that enables the generic prefix expects the load and
; the extend to become one predicated ld1h into 32-bit lanes (ptrue .s, vl8),
; with the result written by st1w to [x8].
define <8 x i32> @load_zext_v8i16i32(<8 x i16>* %ap) #0 {
; CHECK-LABEL: load_zext_v8i16i32
; CHECK: ptrue [[P0:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1h { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0]
; CHECK-NEXT: st1w { [[Z0]].s }, [[P0]], [x8]
; CHECK-NEXT: ret
%a = load <8 x i16>, <8 x i16>* %ap
%val = zext <8 x i16> %a to <8 x i32>
ret <8 x i32> %val
}
; Zero-extending load of <16 x i16> to <16 x i32>.
; Under the 512-bit-and-up prefix the expected code is a single ld1h into
; 32-bit lanes (ptrue .s, vl16) followed by st1w to [x8].
; Under the 256-bit-only prefix the checks describe the legalised form: an
; ld1h into 16-bit lanes, a round trip through the stack (st1h then ldp of two
; q registers), two uunpklo widenings, and two st1w stores to x8 and x8+32.
define <16 x i32> @load_zext_v16i16i32(<16 x i16>* %ap) #0 {
; CHECK-LABEL: load_zext_v16i16i32
; VBITS_GE_512: ptrue [[P0:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0]
; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[P0]], [x8]
; VBITS_GE_512-NEXT: ret
; Ensure sensible type legalisation
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: ld1h { [[Z0:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: mov x9, sp
; VBITS_EQ_256-DAG: st1h { [[Z0]].h }, [[PG]], [x9]
; VBITS_EQ_256-DAG: ldp q[[R0:[0-9]+]], q[[R1:[0-9]+]], [sp]
; VBITS_EQ_256-DAG: add x9, x8, #32
; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: uunpklo z[[R0]].s, z[[R0]].h
; VBITS_EQ_256-DAG: uunpklo z[[R1]].s, z[[R1]].h
; VBITS_EQ_256-DAG: st1w { z[[R1]].s }, [[PG1]], [x9]
; VBITS_EQ_256-DAG: st1w { z[[R0]].s }, [[PG1]], [x8]
; VBITS_EQ_256-DAG: ret
%a = load <16 x i16>, <16 x i16>* %ap
%val = zext <16 x i16> %a to <16 x i32>
ret <16 x i32> %val
}
; Zero-extending load of <32 x i16> to <32 x i32>.
; Beyond the label, only the 1024-bit-and-up prefix has checks: a single ld1h
; into 32-bit lanes (ptrue .s, vl32) followed by st1w to [x8].
define <32 x i32> @load_zext_v32i16i32(<32 x i16>* %ap) #0 {
; CHECK-LABEL: load_zext_v32i16i32
; VBITS_GE_1024: ptrue [[P0:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1h { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0]
; VBITS_GE_1024-NEXT: st1w { [[Z0]].s }, [[P0]], [x8]
; VBITS_GE_1024-NEXT: ret
%a = load <32 x i16>, <32 x i16>* %ap
%val = zext <32 x i16> %a to <32 x i32>
ret <32 x i32> %val
}
; Zero-extending load of <64 x i16> to <64 x i32>.
; Beyond the label, only the 2048-bit prefix has checks: a single ld1h into
; 32-bit lanes (ptrue .s, vl64) followed by st1w to [x8].
define <64 x i32> @load_zext_v64i16i32(<64 x i16>* %ap) #0 {
; CHECK-LABEL: load_zext_v64i16i32
; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1h { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0]
; VBITS_GE_2048-NEXT: st1w { [[Z0]].s }, [[P0]], [x8]
; VBITS_GE_2048-NEXT: ret
%a = load <64 x i16>, <64 x i16>* %ap
%val = zext <64 x i16> %a to <64 x i32>
ret <64 x i32> %val
}
; Sign-extending load of <4 x i16> to <4 x i32>.
; The expected code is a 64-bit NEON register load followed by sshll, then
; ret; no SVE predicate or ld1sh appears in the expected sequence.
define <4 x i32> @load_sext_v4i16i32(<4 x i16>* %ap) #0 {
; CHECK-LABEL: load_sext_v4i16i32
; CHECK: ldr d[[D0:[0-9]+]], [x0]
; CHECK-NEXT: sshll v[[D0]].4s, v[[D0]].4h, #0
; CHECK-NEXT: ret
%a = load <4 x i16>, <4 x i16>* %ap
%val = sext <4 x i16> %a to <4 x i32>
ret <4 x i32> %val
}
; Sign-extending load of <8 x i16> to <8 x i32>.
; Every configuration that enables the generic prefix expects the load and
; the extend to become one predicated ld1sh into 32-bit lanes (ptrue .s, vl8),
; with the result written by st1w to [x8].
define <8 x i32> @load_sext_v8i16i32(<8 x i16>* %ap) #0 {
; CHECK-LABEL: load_sext_v8i16i32
; CHECK: ptrue [[P0:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1sh { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0]
; CHECK-NEXT: st1w { [[Z0]].s }, [[P0]], [x8]
; CHECK-NEXT: ret
%a = load <8 x i16>, <8 x i16>* %ap
%val = sext <8 x i16> %a to <8 x i32>
ret <8 x i32> %val
}
; Sign-extending load of <16 x i16> to <16 x i32>.
; Under the 512-bit-and-up prefix the expected code is a single ld1sh into
; 32-bit lanes (ptrue .s, vl16) followed by st1w to [x8].
; Under the 256-bit-only prefix the checks describe the legalised form: an
; ld1h into 16-bit lanes, a round trip through the stack (st1h then ldp of two
; q registers), two sunpklo widenings, and two st1w stores to x8 and x8+32.
define <16 x i32> @load_sext_v16i16i32(<16 x i16>* %ap) #0 {
; CHECK-LABEL: load_sext_v16i16i32
; VBITS_GE_512: ptrue [[P0:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1sh { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0]
; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[P0]], [x8]
; VBITS_GE_512-NEXT: ret
; Ensure sensible type legalisation
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: ld1h { [[Z0:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: mov x9, sp
; VBITS_EQ_256-DAG: st1h { [[Z0]].h }, [[PG]], [x9]
; VBITS_EQ_256-DAG: ldp q[[R0:[0-9]+]], q[[R1:[0-9]+]], [sp]
; VBITS_EQ_256-DAG: add x9, x8, #32
; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: sunpklo z[[R0]].s, z[[R0]].h
; VBITS_EQ_256-DAG: sunpklo z[[R1]].s, z[[R1]].h
; VBITS_EQ_256-DAG: st1w { z[[R1]].s }, [[PG1]], [x9]
; VBITS_EQ_256-DAG: st1w { z[[R0]].s }, [[PG1]], [x8]
; VBITS_EQ_256-DAG: ret
%a = load <16 x i16>, <16 x i16>* %ap
%val = sext <16 x i16> %a to <16 x i32>
ret <16 x i32> %val
}
; Sign-extending load of <32 x i16> to <32 x i32>.
; Beyond the label, only the 1024-bit-and-up prefix has checks: a single ld1sh
; into 32-bit lanes (ptrue .s, vl32) followed by st1w to [x8].
define <32 x i32> @load_sext_v32i16i32(<32 x i16>* %ap) #0 {
; CHECK-LABEL: load_sext_v32i16i32
; VBITS_GE_1024: ptrue [[P0:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1sh { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0]
; VBITS_GE_1024-NEXT: st1w { [[Z0]].s }, [[P0]], [x8]
; VBITS_GE_1024-NEXT: ret
%a = load <32 x i16>, <32 x i16>* %ap
%val = sext <32 x i16> %a to <32 x i32>
ret <32 x i32> %val
}
; Sign-extending load of <64 x i16> to <64 x i32>.
; Beyond the label, only the 2048-bit prefix has checks: a single ld1sh into
; 32-bit lanes (ptrue .s, vl64) followed by st1w to [x8].
define <64 x i32> @load_sext_v64i16i32(<64 x i16>* %ap) #0 {
; CHECK-LABEL: load_sext_v64i16i32
; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1sh { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0]
; VBITS_GE_2048-NEXT: st1w { [[Z0]].s }, [[P0]], [x8]
; VBITS_GE_2048-NEXT: ret
%a = load <64 x i16>, <64 x i16>* %ap
%val = sext <64 x i16> %a to <64 x i32>
ret <64 x i32> %val
}
; Zero-extending load of <32 x i8> to <32 x i64> (8-bit to 64-bit elements).
; Beyond the label, only the 2048-bit prefix has checks: a single ld1b into
; 64-bit lanes (ptrue .d, vl32) followed by st1d to [x8].
define <32 x i64> @load_zext_v32i8i64(<32 x i8>* %ap) #0 {
; CHECK-LABEL: load_zext_v32i8i64
; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1b { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0]
; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8]
; VBITS_GE_2048-NEXT: ret
%a = load <32 x i8>, <32 x i8>* %ap
%val = zext <32 x i8> %a to <32 x i64>
ret <32 x i64> %val
}
; Sign-extending load of <32 x i8> to <32 x i64> (8-bit to 64-bit elements).
; Beyond the label, only the 2048-bit prefix has checks: a single ld1sb into
; 64-bit lanes (ptrue .d, vl32) followed by st1d to [x8].
define <32 x i64> @load_sext_v32i8i64(<32 x i8>* %ap) #0 {
; CHECK-LABEL: load_sext_v32i8i64
; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1sb { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0]
; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8]
; VBITS_GE_2048-NEXT: ret
%a = load <32 x i8>, <32 x i8>* %ap
%val = sext <32 x i8> %a to <32 x i64>
ret <32 x i64> %val
}
; Zero-extending load of <32 x i16> to <32 x i64> (16-bit to 64-bit elements).
; Beyond the label, only the 2048-bit prefix has checks: a single ld1h into
; 64-bit lanes (ptrue .d, vl32) followed by st1d to [x8].
define <32 x i64> @load_zext_v32i16i64(<32 x i16>* %ap) #0 {
; CHECK-LABEL: load_zext_v32i16i64
; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1h { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0]
; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8]
; VBITS_GE_2048-NEXT: ret
%a = load <32 x i16>, <32 x i16>* %ap
%val = zext <32 x i16> %a to <32 x i64>
ret <32 x i64> %val
}
; Sign-extending load of <32 x i16> to <32 x i64> (16-bit to 64-bit elements).
; Beyond the label, only the 2048-bit prefix has checks: a single ld1sh into
; 64-bit lanes (ptrue .d, vl32) followed by st1d to [x8].
define <32 x i64> @load_sext_v32i16i64(<32 x i16>* %ap) #0 {
; CHECK-LABEL: load_sext_v32i16i64
; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1sh { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0]
; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8]
; VBITS_GE_2048-NEXT: ret
%a = load <32 x i16>, <32 x i16>* %ap
%val = sext <32 x i16> %a to <32 x i64>
ret <32 x i64> %val
}
; Zero-extending load of <32 x i32> to <32 x i64> (32-bit to 64-bit elements).
; Beyond the label, only the 2048-bit prefix has checks: a single ld1w into
; 64-bit lanes (ptrue .d, vl32) followed by st1d to [x8].
define <32 x i64> @load_zext_v32i32i64(<32 x i32>* %ap) #0 {
; CHECK-LABEL: load_zext_v32i32i64
; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1w { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0]
; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8]
; VBITS_GE_2048-NEXT: ret
%a = load <32 x i32>, <32 x i32>* %ap
%val = zext <32 x i32> %a to <32 x i64>
ret <32 x i64> %val
}
; Sign-extending load of <32 x i32> to <32 x i64> (32-bit to 64-bit elements).
; Beyond the label, only the 2048-bit prefix has checks: a single ld1sw into
; 64-bit lanes (ptrue .d, vl32) followed by st1d to [x8].
define <32 x i64> @load_sext_v32i32i64(<32 x i32>* %ap) #0 {
; CHECK-LABEL: load_sext_v32i32i64
; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1sw { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0]
; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8]
; VBITS_GE_2048-NEXT: ret
%a = load <32 x i32>, <32 x i32>* %ap
%val = sext <32 x i32> %a to <32 x i64>
ret <32 x i64> %val
}
attributes #0 = { "target-features"="+sve" }

View File

@ -1,11 +1,11 @@
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256,VBITS_EQ_256
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256,VBITS_EQ_512
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256,VBITS_EQ_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
@ -75,12 +75,19 @@ define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
define void @smulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
; CHECK-LABEL: smulh_v32i8:
; VBITS_GE_256: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
; VBITS_GE_256-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_256-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; VBITS_GE_256: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
; VBITS_GE_256: st1b { [[RES]].b }, [[PG]], [x0]
; VBITS_GE_256: ret
; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
; VBITS_EQ_256-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; VBITS_EQ_256: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
; VBITS_EQ_256: ret
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,32)]]
; VBITS_GE_512-DAG: ld1sb { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1sb { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_512: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_512: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8
; VBITS_GE_512: st1b { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_512: ret
%op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b
%insert = insertelement <32 x i16> undef, i16 8, i64 0
@ -96,12 +103,19 @@ define void @smulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
define void @smulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
; CHECK-LABEL: smulh_v64i8:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; VBITS_GE_512: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
; VBITS_GE_512: st1b { [[RES]].b }, [[PG]], [x0]
; VBITS_GE_512: ret
; VBITS_EQ_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_EQ_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; VBITS_EQ_512: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
; VBITS_EQ_512: st1b { [[RES]].b }, [[PG]], [x0]
; VBITS_EQ_512: ret
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,64)]]
; VBITS_GE_1024-DAG: ld1sb { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1sb { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_1024: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_1024: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8
; VBITS_GE_1024: st1b { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_1024: ret
%op1 = load <64 x i8>, <64 x i8>* %a
%op2 = load <64 x i8>, <64 x i8>* %b
%insert = insertelement <64 x i16> undef, i16 8, i64 0
@ -117,12 +131,19 @@ define void @smulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
define void @smulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
; CHECK-LABEL: smulh_v128i8:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; VBITS_GE_1024: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
; VBITS_GE_1024: st1b { [[RES]].b }, [[PG]], [x0]
; VBITS_GE_1024: ret
; VBITS_EQ_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_EQ_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; VBITS_EQ_1024: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
; VBITS_EQ_1024: st1b { [[RES]].b }, [[PG]], [x0]
; VBITS_EQ_1024: ret
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,128)]]
; VBITS_GE_2048-DAG: ld1sb { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1sb { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_2048: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_2048: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8
; VBITS_GE_2048: st1b { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_2048: ret
%op1 = load <128 x i8>, <128 x i8>* %a
%op2 = load <128 x i8>, <128 x i8>* %b
%insert = insertelement <128 x i16> undef, i16 8, i64 0
@ -198,12 +219,20 @@ define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
define void @smulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
; CHECK-LABEL: smulh_v16i16:
; VBITS_GE_256: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,16)]]
; VBITS_GE_256-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_256-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_256: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_256: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_256: ret
; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,16)]]
; VBITS_EQ_256-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_EQ_256: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_EQ_256: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_EQ_256: ret
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,16)]]
; VBITS_GE_512-DAG: ld1sh { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1sh { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_512: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_512: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16
; VBITS_GE_512: st1h { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_512: ret
%op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b
%insert = insertelement <16 x i32> undef, i32 16, i64 0
@ -219,12 +248,20 @@ define void @smulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
define void @smulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
; CHECK-LABEL: smulh_v32i16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,32)]]
; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_512: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_512: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_512: ret
; VBITS_EQ_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,32)]]
; VBITS_EQ_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_EQ_512: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_EQ_512: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_EQ_512: ret
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,32)]]
; VBITS_GE_1024-DAG: ld1sh { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1sh { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_1024: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_1024: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16
; VBITS_GE_1024: st1h { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_1024: ret
%op1 = load <32 x i16>, <32 x i16>* %a
%op2 = load <32 x i16>, <32 x i16>* %b
%insert = insertelement <32 x i32> undef, i32 16, i64 0
@ -240,12 +277,20 @@ define void @smulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
define void @smulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
; CHECK-LABEL: smulh_v64i16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,64)]]
; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_1024: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_1024: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_1024: ret
; VBITS_EQ_1024: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,64)]]
; VBITS_EQ_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_EQ_1024: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_EQ_1024: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_EQ_1024: ret
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,64)]]
; VBITS_GE_2048-DAG: ld1sh { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1sh { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_2048: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_2048: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16
; VBITS_GE_2048: st1h { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_2048: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%op2 = load <64 x i16>, <64 x i16>* %b
%insert = insertelement <64 x i32> undef, i32 16, i64 0
@ -318,12 +363,20 @@ define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
define void @smulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
; CHECK-LABEL: smulh_v8i32:
; VBITS_GE_256: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,8)]]
; VBITS_GE_256-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_256-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_256: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_256: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_256: ret
; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,8)]]
; VBITS_EQ_256-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_EQ_256: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_EQ_256: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_EQ_256: ret
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,8)]]
; VBITS_GE_512-DAG: ld1sw { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1sw { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_512: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_512: lsr [[RES]].d, [[PG]]/m, [[RES]].d, #32
; VBITS_GE_512: st1w { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_512: ret
%op1 = load <8 x i32>, <8 x i32>* %a
%op2 = load <8 x i32>, <8 x i32>* %b
%insert = insertelement <8 x i64> undef, i64 32, i64 0
@ -339,12 +392,19 @@ define void @smulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
define void @smulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
; CHECK-LABEL: smulh_v16i32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,16)]]
; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_512: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_512: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_512: ret
; VBITS_EQ_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,16)]]
; VBITS_EQ_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_EQ_512: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_EQ_512: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_EQ_512: ret
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,16)]]
; VBITS_GE_1024-DAG: ld1sw { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1sw { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_1024: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_1024: st1w { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_1024: ret
%op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b
%insert = insertelement <16 x i64> undef, i64 32, i64 0
@ -360,12 +420,20 @@ define void @smulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
define void @smulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
; CHECK-LABEL: smulh_v32i32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,32)]]
; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_1024: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_1024: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_1024: ret
; VBITS_EQ_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,32)]]
; VBITS_EQ_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_EQ_1024: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_EQ_1024: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_EQ_1024: ret
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,32)]]
; VBITS_GE_2048-DAG: ld1sw { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1sw { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_2048: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_2048: lsr [[RES]].d, [[PG]]/m, [[RES]].d, #32
; VBITS_GE_2048: st1w { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_2048: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%op2 = load <32 x i32>, <32 x i32>* %b
%insert = insertelement <32 x i64> undef, i64 32, i64 0
@ -563,12 +631,20 @@ define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
define void @umulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
; CHECK-LABEL: umulh_v32i8:
; VBITS_GE_256: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
; VBITS_GE_256-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_256-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; VBITS_GE_256: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
; VBITS_GE_256: st1b { [[RES]].b }, [[PG]], [x0]
; VBITS_GE_256: ret
; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
; VBITS_EQ_256-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; VBITS_EQ_256: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
; VBITS_EQ_256: st1b { [[RES]].b }, [[PG]], [x0]
; VBITS_EQ_256: ret
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,32)]]
; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_512: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_512: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8
; VBITS_GE_512: st1b { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_512: ret
%op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b
%insert = insertelement <32 x i16> undef, i16 8, i64 0
@ -584,12 +660,18 @@ define void @umulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
define void @umulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
; CHECK-LABEL: umulh_v64i8:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; VBITS_GE_512: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
; VBITS_GE_512: st1b { [[RES]].b }, [[PG]], [x0]
; VBITS_GE_512: ret
; VBITS_EQ_512: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
; VBITS_EQ_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_EQ_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; VBITS_EQ_512: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
; VBITS_EQ_512: ret
; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_1024: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_1024: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8
; VBITS_GE_1024: st1b { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_1024: ret
%op1 = load <64 x i8>, <64 x i8>* %a
%op2 = load <64 x i8>, <64 x i8>* %b
%insert = insertelement <64 x i16> undef, i16 8, i64 0
@ -605,12 +687,20 @@ define void @umulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
define void @umulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
; CHECK-LABEL: umulh_v128i8:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; VBITS_GE_1024: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
; VBITS_GE_1024: st1b { [[RES]].b }, [[PG]], [x0]
; VBITS_GE_1024: ret
; VBITS_EQ_1024: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
; VBITS_EQ_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_EQ_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; VBITS_EQ_1024: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
; VBITS_EQ_1024: st1b { [[RES]].b }, [[PG]], [x0]
; VBITS_EQ_1024: ret
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,128)]]
; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_2048: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_2048: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8
; VBITS_GE_2048: st1b { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_2048: ret
%op1 = load <128 x i8>, <128 x i8>* %a
%op2 = load <128 x i8>, <128 x i8>* %b
%insert = insertelement <128 x i16> undef, i16 8, i64 0
@ -686,12 +776,20 @@ define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
define void @umulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
; CHECK-LABEL: umulh_v16i16:
; VBITS_GE_256: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,16)]]
; VBITS_GE_256-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_256-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_256: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_256: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_256: ret
; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,16)]]
; VBITS_EQ_256-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_EQ_256: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_EQ_256: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_EQ_256: ret
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,16)]]
; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_512: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_512: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16
; VBITS_GE_512: st1h { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_512: ret
%op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b
%insert = insertelement <16 x i32> undef, i32 16, i64 0
@ -707,12 +805,20 @@ define void @umulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
define void @umulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
; CHECK-LABEL: umulh_v32i16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,32)]]
; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_512: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_512: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_512: ret
; VBITS_EQ_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,32)]]
; VBITS_EQ_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_EQ_512: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_EQ_512: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_EQ_512: ret
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,32)]]
; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_1024: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_1024: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16
; VBITS_GE_1024: st1h { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_1024: ret
%op1 = load <32 x i16>, <32 x i16>* %a
%op2 = load <32 x i16>, <32 x i16>* %b
%insert = insertelement <32 x i32> undef, i32 16, i64 0
@ -728,12 +834,19 @@ define void @umulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
define void @umulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
; CHECK-LABEL: umulh_v64i16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,64)]]
; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_1024: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_1024: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_1024: ret
; VBITS_EQ_1024: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,64)]]
; VBITS_EQ_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_EQ_1024: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_EQ_1024: ret
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,64)]]
; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_2048: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_2048: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16
; VBITS_GE_2048: st1h { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_2048: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%op2 = load <64 x i16>, <64 x i16>* %b
%insert = insertelement <64 x i32> undef, i32 16, i64 0
@ -806,12 +919,20 @@ define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
define void @umulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
; CHECK-LABEL: umulh_v8i32:
; VBITS_GE_256: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,8)]]
; VBITS_GE_256-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_256-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_256: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_256: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_256: ret
; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,8)]]
; VBITS_EQ_256-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_EQ_256: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_EQ_256: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_EQ_256: ret
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,8)]]
; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_512: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_512: lsr [[RES]].d, [[PG]]/m, [[RES]].d, #32
; VBITS_GE_512: st1w { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_512: ret
%op1 = load <8 x i32>, <8 x i32>* %a
%op2 = load <8 x i32>, <8 x i32>* %b
%insert = insertelement <8 x i64> undef, i64 32, i64 0
@ -827,12 +948,19 @@ define void @umulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
define void @umulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
; CHECK-LABEL: umulh_v16i32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,16)]]
; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_512: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_512: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_512: ret
; VBITS_EQ_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,16)]]
; VBITS_EQ_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_EQ_512: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_EQ_512: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_EQ_512: ret
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,16)]]
; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_1024: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_1024: lsr [[RES]].d, [[PG]]/m, [[RES]].d, #32
; VBITS_GE_1024: st1w { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_1024: ret
%op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b
%insert = insertelement <16 x i64> undef, i64 32, i64 0
@ -848,12 +976,20 @@ define void @umulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
define void @umulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
; CHECK-LABEL: umulh_v32i32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,32)]]
; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_1024: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_1024: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_1024: ret
; VBITS_EQ_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,32)]]
; VBITS_EQ_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_EQ_1024: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_EQ_1024: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_EQ_1024: ret
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,32)]]
; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_2048: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_2048: lsr [[RES]].d, [[PG]]/m, [[RES]].d, #32
; VBITS_GE_2048: st1w { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_2048: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%op2 = load <32 x i32>, <32 x i32>* %b
%insert = insertelement <32 x i64> undef, i64 32, i64 0