From 9c47d6b48d6b0f0dafb87241f1561fc1b48f9ecd Mon Sep 17 00:00:00 2001 From: David Truby Date: Wed, 11 Aug 2021 14:59:00 +0000 Subject: [PATCH] [llvm][sve] Lowering for VLS extending loads This patch enables extending loads for fixed length SVE code generation. There is a slight regression here in the mulh tests; since these tests load the parameter and then extend it these are treated as extending loads which are merged, preventing the mulh instruction from being generated. As this affects scalable SVE codegen as well this should be addressed in a separate patch. Reviewed By: bsmith Differential Revision: https://reviews.llvm.org/D107057 --- .../Target/AArch64/AArch64ISelLowering.cpp | 6 +- .../AArch64/sve-fixed-length-ext-loads.ll | 225 +++++++++++ .../AArch64/sve-fixed-length-int-mulh.ll | 358 ++++++++++++------ 3 files changed, 477 insertions(+), 112 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 6921e6726135..732310c58ec3 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1516,6 +1516,8 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { MVT InnerVT = VT.changeVectorElementType(MVT::i8); while (InnerVT != VT) { setTruncStoreAction(VT, InnerVT, Custom); + setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom); InnerVT = InnerVT.changeVectorElementType( MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits())); } @@ -4176,7 +4178,9 @@ bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT VT) const { } bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { - return ExtVal.getValueType().isScalableVector(); + return ExtVal.getValueType().isScalableVector() || + useSVEForFixedLengthVectorVT(ExtVal.getValueType(), + /*OverrideNEON=*/true); } unsigned 
getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) { diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll new file mode 100644 index 000000000000..20178a98ca4f --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll @@ -0,0 +1,225 @@ +; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE +; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_EQ_256 +; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK +; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 
-check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_2048,VBITS_GE_1024,VBITS_GE_512 + +target triple = "aarch64-unknown-linux-gnu" + +; Don't use SVE when its registers are no bigger than NEON. +; NO_SVE-NOT: ptrue + +define <4 x i32> @load_zext_v4i16i32(<4 x i16>* %ap) #0 { + ; CHECK-LABEL: load_zext_v4i16i32 + ; CHECK: ldr d[[D0:[0-9]+]], [x0] + ; CHECK-NEXT: ushll v[[D0]].4s, v[[D0]].4h, #0 + ; CHECK-NEXT: ret + %a = load <4 x i16>, <4 x i16>* %ap + %val = zext <4 x i16> %a to <4 x i32> + ret <4 x i32> %val +} + +define <8 x i32> @load_zext_v8i16i32(<8 x i16>* %ap) #0 { + ; CHECK-LABEL: load_zext_v8i16i32 + ; CHECK: ptrue [[P0:p[0-9]+]].s, vl8 + ; CHECK-NEXT: ld1h { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0] + ; CHECK-NEXT: st1w { [[Z0]].s }, [[P0]], [x8] + ; CHECK-NEXT: ret + %a = load <8 x i16>, <8 x i16>* %ap + %val = zext <8 x i16> %a to <8 x i32> + ret <8 x i32> %val +} + +define <16 x i32> @load_zext_v16i16i32(<16 x i16>* %ap) #0 { + ; CHECK-LABEL: load_zext_v16i16i32 + ; VBITS_GE_512: ptrue [[P0:p[0-9]+]].s, vl16 + ; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0] + ; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[P0]], [x8] + ; VBITS_GE_512-NEXT: ret + + ; Ensure sensible type legalisation + ; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16 + ; VBITS_EQ_256-DAG: ld1h { [[Z0:z[0-9]+]].h }, [[PG]]/z, [x0] + ; VBITS_EQ_256-DAG: mov x9, sp + ; VBITS_EQ_256-DAG: st1h { [[Z0]].h }, [[PG]], [x9] + ; VBITS_EQ_256-DAG: ldp q[[R0:[0-9]+]], q[[R1:[0-9]+]], [sp] + ; VBITS_EQ_256-DAG: add x9, x8, #32 + ; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8 + ; VBITS_EQ_256-DAG: uunpklo z[[R0]].s, z[[R0]].h + ; VBITS_EQ_256-DAG: uunpklo z[[R1]].s, z[[R1]].h + ; VBITS_EQ_256-DAG: st1w { z[[R1]].s }, [[PG1]], [x9] + ; VBITS_EQ_256-DAG: st1w {
z[[R0]].s }, [[PG1]], [x8] + ; VBITS_EQ_256-DAG: ret + %a = load <16 x i16>, <16 x i16>* %ap + %val = zext <16 x i16> %a to <16 x i32> + ret <16 x i32> %val +} + +define <32 x i32> @load_zext_v32i16i32(<32 x i16>* %ap) #0 { + ; CHECK-LABEL: load_zext_v32i16i32 + ; VBITS_GE_1024: ptrue [[P0:p[0-9]+]].s, vl32 + ; VBITS_GE_1024-NEXT: ld1h { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0] + ; VBITS_GE_1024-NEXT: st1w { [[Z0]].s }, [[P0]], [x8] + ; VBITS_GE_1024-NEXT: ret + %a = load <32 x i16>, <32 x i16>* %ap + %val = zext <32 x i16> %a to <32 x i32> + ret <32 x i32> %val +} + +define <64 x i32> @load_zext_v64i16i32(<64 x i16>* %ap) #0 { + ; CHECK-LABEL: load_zext_v64i16i32 + ; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].s, vl64 + ; VBITS_GE_2048-NEXT: ld1h { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0] + ; VBITS_GE_2048-NEXT: st1w { [[Z0]].s }, [[P0]], [x8] + ; VBITS_GE_2048-NEXT: ret + %a = load <64 x i16>, <64 x i16>* %ap + %val = zext <64 x i16> %a to <64 x i32> + ret <64 x i32> %val +} + +define <4 x i32> @load_sext_v4i16i32(<4 x i16>* %ap) #0 { + ; CHECK-LABEL: load_sext_v4i16i32 + ; CHECK: ldr d[[D0:[0-9]+]], [x0] + ; CHECK-NEXT: sshll v[[D0]].4s, v[[D0]].4h, #0 + ; CHECK-NEXT: ret + %a = load <4 x i16>, <4 x i16>* %ap + %val = sext <4 x i16> %a to <4 x i32> + ret <4 x i32> %val +} + +define <8 x i32> @load_sext_v8i16i32(<8 x i16>* %ap) #0 { + ; CHECK-LABEL: load_sext_v8i16i32 + ; CHECK: ptrue [[P0:p[0-9]+]].s, vl8 + ; CHECK-NEXT: ld1sh { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0] + ; CHECK-NEXT: st1w { [[Z0]].s }, [[P0]], [x8] + ; CHECK-NEXT: ret + %a = load <8 x i16>, <8 x i16>* %ap + %val = sext <8 x i16> %a to <8 x i32> + ret <8 x i32> %val +} + +define <16 x i32> @load_sext_v16i16i32(<16 x i16>* %ap) #0 { + ; CHECK-LABEL: load_sext_v16i16i32 + ; VBITS_GE_512: ptrue [[P0:p[0-9]+]].s, vl16 + ; VBITS_GE_512-NEXT: ld1sh { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0] + ; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[P0]], [x8] + ; VBITS_GE_512-NEXT: ret + + ; Ensure sensible type legalisation + ; VBITS_EQ_256-DAG:
ptrue [[PG:p[0-9]+]].h, vl16 + ; VBITS_EQ_256-DAG: ld1h { [[Z0:z[0-9]+]].h }, [[PG]]/z, [x0] + ; VBITS_EQ_256-DAG: mov x9, sp + ; VBITS_EQ_256-DAG: st1h { [[Z0]].h }, [[PG]], [x9] + ; VBITS_EQ_256-DAG: ldp q[[R0:[0-9]+]], q[[R1:[0-9]+]], [sp] + ; VBITS_EQ_256-DAG: add x9, x8, #32 + ; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8 + ; VBITS_EQ_256-DAG: sunpklo z[[R0]].s, z[[R0]].h + ; VBITS_EQ_256-DAG: sunpklo z[[R1]].s, z[[R1]].h + ; VBITS_EQ_256-DAG: st1w { z[[R1]].s }, [[PG1]], [x9] + ; VBITS_EQ_256-DAG: st1w { z[[R0]].s }, [[PG1]], [x8] + ; VBITS_EQ_256-DAG: ret + %a = load <16 x i16>, <16 x i16>* %ap + %val = sext <16 x i16> %a to <16 x i32> + ret <16 x i32> %val +} + +define <32 x i32> @load_sext_v32i16i32(<32 x i16>* %ap) #0 { + ; CHECK-LABEL: load_sext_v32i16i32 + ; VBITS_GE_1024: ptrue [[P0:p[0-9]+]].s, vl32 + ; VBITS_GE_1024-NEXT: ld1sh { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0] + ; VBITS_GE_1024-NEXT: st1w { [[Z0]].s }, [[P0]], [x8] + ; VBITS_GE_1024-NEXT: ret + %a = load <32 x i16>, <32 x i16>* %ap + %val = sext <32 x i16> %a to <32 x i32> + ret <32 x i32> %val +} + +define <64 x i32> @load_sext_v64i16i32(<64 x i16>* %ap) #0 { + ; CHECK-LABEL: load_sext_v64i16i32 + ; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].s, vl64 + ; VBITS_GE_2048-NEXT: ld1sh { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0] + ; VBITS_GE_2048-NEXT: st1w { [[Z0]].s }, [[P0]], [x8] + ; VBITS_GE_2048-NEXT: ret + %a = load <64 x i16>, <64 x i16>* %ap + %val = sext <64 x i16> %a to <64 x i32> + ret <64 x i32> %val +} + +define <32 x i64> @load_zext_v32i8i64(<32 x i8>* %ap) #0 { + ; CHECK-LABEL: load_zext_v32i8i64 + ; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32 + ; VBITS_GE_2048-NEXT: ld1b { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0] + ; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8] + ; VBITS_GE_2048-NEXT: ret + %a = load <32 x i8>, <32 x i8>* %ap + %val = zext <32 x i8> %a to <32 x i64> + ret <32 x i64> %val +} + +define <32 x i64> @load_sext_v32i8i64(<32 x i8>* %ap) #0 { + ; CHECK-LABEL: load_sext_v32i8i64 + ; 
VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32 + ; VBITS_GE_2048-NEXT: ld1sb { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0] + ; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8] + ; VBITS_GE_2048-NEXT: ret + %a = load <32 x i8>, <32 x i8>* %ap + %val = sext <32 x i8> %a to <32 x i64> + ret <32 x i64> %val +} + +define <32 x i64> @load_zext_v32i16i64(<32 x i16>* %ap) #0 { + ; CHECK-LABEL: load_zext_v32i16i64 + ; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32 + ; VBITS_GE_2048-NEXT: ld1h { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0] + ; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8] + ; VBITS_GE_2048-NEXT: ret + %a = load <32 x i16>, <32 x i16>* %ap + %val = zext <32 x i16> %a to <32 x i64> + ret <32 x i64> %val +} + +define <32 x i64> @load_sext_v32i16i64(<32 x i16>* %ap) #0 { + ; CHECK-LABEL: load_sext_v32i16i64 + ; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32 + ; VBITS_GE_2048-NEXT: ld1sh { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0] + ; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8] + ; VBITS_GE_2048-NEXT: ret + %a = load <32 x i16>, <32 x i16>* %ap + %val = sext <32 x i16> %a to <32 x i64> + ret <32 x i64> %val +} + +define <32 x i64> @load_zext_v32i32i64(<32 x i32>* %ap) #0 { + ; CHECK-LABEL: load_zext_v32i32i64 + ; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32 + ; VBITS_GE_2048-NEXT: ld1w { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0] + ; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8] + ; VBITS_GE_2048-NEXT: ret + %a = load <32 x i32>, <32 x i32>* %ap + %val = zext <32 x i32> %a to <32 x i64> + ret <32 x i64> %val +} + +define <32 x i64> @load_sext_v32i32i64(<32 x i32>* %ap) #0 { + ; CHECK-LABEL: load_sext_v32i32i64 + ; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32 + ; VBITS_GE_2048-NEXT: ld1sw { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0] + ; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8] + ; VBITS_GE_2048-NEXT: ret + %a = load <32 x i32>, <32 x i32>* %ap + %val = sext <32 x i32> %a to <32 x i64> + ret <32 x i64> %val +} + +attributes #0 = { "target-features"="+sve" } diff --git 
a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll index 48bae1bb813a..2af8926cd1e3 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll @@ -1,11 +1,11 @@ ; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE -; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256,VBITS_EQ_256 ; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256 -; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256,VBITS_EQ_512 ; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256 -; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256,VBITS_EQ_1024 ; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256 ; RUN: llc 
-aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256 @@ -75,12 +75,19 @@ define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { define void @smulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { ; CHECK-LABEL: smulh_v32i8: -; VBITS_GE_256: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]] -; VBITS_GE_256-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] -; VBITS_GE_256-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] -; VBITS_GE_256: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b -; VBITS_GE_256: st1b { [[RES]].b }, [[PG]], [x0] -; VBITS_GE_256: ret +; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]] +; VBITS_EQ_256-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] +; VBITS_EQ_256: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b +; VBITS_EQ_256: ret + +; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,32)]] +; VBITS_GE_512-DAG: ld1sb { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_512-DAG: ld1sb { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_GE_512: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; VBITS_GE_512: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8 +; VBITS_GE_512: st1b { [[RES]].h }, [[PG]], [x0] +; VBITS_GE_512: ret %op1 = load <32 x i8>, <32 x i8>* %a %op2 = load <32 x i8>, <32 x i8>* %b %insert = insertelement <32 x i16> undef, i16 8, i64 0 @@ -96,12 +103,19 @@ define void @smulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { define void @smulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { ; CHECK-LABEL: smulh_v64i8: -; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]] -; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] -; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] -; VBITS_GE_512: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b -; VBITS_GE_512: st1b { [[RES]].b }, [[PG]], [x0] -; VBITS_GE_512: ret +; VBITS_EQ_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, 
[[PG]]/z, [x0] +; VBITS_EQ_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] +; VBITS_EQ_512: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b +; VBITS_EQ_512: st1b { [[RES]].b }, [[PG]], [x0] +; VBITS_EQ_512: ret + +; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,64)]] +; VBITS_GE_1024-DAG: ld1sb { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_1024-DAG: ld1sb { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_GE_1024: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; VBITS_GE_1024: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8 +; VBITS_GE_1024: st1b { [[RES]].h }, [[PG]], [x0] +; VBITS_GE_1024: ret %op1 = load <64 x i8>, <64 x i8>* %a %op2 = load <64 x i8>, <64 x i8>* %b %insert = insertelement <64 x i16> undef, i16 8, i64 0 @@ -117,12 +131,19 @@ define void @smulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { define void @smulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 { ; CHECK-LABEL: smulh_v128i8: -; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]] -; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] -; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] -; VBITS_GE_1024: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b -; VBITS_GE_1024: st1b { [[RES]].b }, [[PG]], [x0] -; VBITS_GE_1024: ret +; VBITS_EQ_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] +; VBITS_EQ_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] +; VBITS_EQ_1024: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b +; VBITS_EQ_1024: st1b { [[RES]].b }, [[PG]], [x0] +; VBITS_EQ_1024: ret + +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,128)]] +; VBITS_GE_2048-DAG: ld1sb { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_2048-DAG: ld1sb { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_GE_2048: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; VBITS_GE_2048: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8 +; VBITS_GE_2048: st1b { [[RES]].h }, [[PG]], [x0] +; VBITS_GE_2048: ret %op1 = load <128 x i8>, <128 x i8>* %a %op2 = load 
<128 x i8>, <128 x i8>* %b %insert = insertelement <128 x i16> undef, i16 8, i64 0 @@ -198,12 +219,20 @@ define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { define void @smulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { ; CHECK-LABEL: smulh_v16i16: -; VBITS_GE_256: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,16)]] -; VBITS_GE_256-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] -; VBITS_GE_256-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] -; VBITS_GE_256: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h -; VBITS_GE_256: st1h { [[RES]].h }, [[PG]], [x0] -; VBITS_GE_256: ret +; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,16)]] +; VBITS_EQ_256-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_EQ_256: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; VBITS_EQ_256: st1h { [[RES]].h }, [[PG]], [x0] +; VBITS_EQ_256: ret + +; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,16)]] +; VBITS_GE_512-DAG: ld1sh { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_512-DAG: ld1sh { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_GE_512: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; VBITS_GE_512: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16 +; VBITS_GE_512: st1h { [[RES]].s }, [[PG]], [x0] +; VBITS_GE_512: ret %op1 = load <16 x i16>, <16 x i16>* %a %op2 = load <16 x i16>, <16 x i16>* %b %insert = insertelement <16 x i32> undef, i32 16, i64 0 @@ -219,12 +248,20 @@ define void @smulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { define void @smulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { ; CHECK-LABEL: smulh_v32i16: -; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,32)]] -; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] -; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] -; VBITS_GE_512: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h -; VBITS_GE_512: st1h { [[RES]].h }, [[PG]], [x0] -; VBITS_GE_512: ret +; VBITS_EQ_512: ptrue 
[[PG:p[0-9]+]].h, vl[[#min(VBYTES,32)]] +; VBITS_EQ_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_EQ_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_EQ_512: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; VBITS_EQ_512: st1h { [[RES]].h }, [[PG]], [x0] +; VBITS_EQ_512: ret + +; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,32)]] +; VBITS_GE_1024-DAG: ld1sh { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_1024-DAG: ld1sh { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_GE_1024: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; VBITS_GE_1024: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16 +; VBITS_GE_1024: st1h { [[RES]].s }, [[PG]], [x0] +; VBITS_GE_1024: ret %op1 = load <32 x i16>, <32 x i16>* %a %op2 = load <32 x i16>, <32 x i16>* %b %insert = insertelement <32 x i32> undef, i32 16, i64 0 @@ -240,12 +277,20 @@ define void @smulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { define void @smulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 { ; CHECK-LABEL: smulh_v64i16: -; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,64)]] -; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] -; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] -; VBITS_GE_1024: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h -; VBITS_GE_1024: st1h { [[RES]].h }, [[PG]], [x0] -; VBITS_GE_1024: ret +; VBITS_EQ_1024: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,64)]] +; VBITS_EQ_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_EQ_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_EQ_1024: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; VBITS_EQ_1024: st1h { [[RES]].h }, [[PG]], [x0] +; VBITS_EQ_1024: ret + +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,64)]] +; VBITS_GE_2048-DAG: ld1sh { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_2048-DAG: ld1sh { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_GE_2048: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; VBITS_GE_2048: lsr 
[[RES]].s, [[PG]]/m, [[RES]].s, #16 +; VBITS_GE_2048: st1h { [[RES]].s }, [[PG]], [x0] +; VBITS_GE_2048: ret %op1 = load <64 x i16>, <64 x i16>* %a %op2 = load <64 x i16>, <64 x i16>* %b %insert = insertelement <64 x i32> undef, i32 16, i64 0 @@ -318,12 +363,20 @@ define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { define void @smulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { ; CHECK-LABEL: smulh_v8i32: -; VBITS_GE_256: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,8)]] -; VBITS_GE_256-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] -; VBITS_GE_256-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] -; VBITS_GE_256: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s -; VBITS_GE_256: st1w { [[RES]].s }, [[PG]], [x0] -; VBITS_GE_256: ret +; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,8)]] +; VBITS_EQ_256-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_EQ_256: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; VBITS_EQ_256: st1w { [[RES]].s }, [[PG]], [x0] +; VBITS_EQ_256: ret + +; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,8)]] +; VBITS_GE_512-DAG: ld1sw { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_512-DAG: ld1sw { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_GE_512: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d +; VBITS_GE_512: lsr [[RES]].d, [[PG]]/m, [[RES]].d, #32 +; VBITS_GE_512: st1w { [[RES]].d }, [[PG]], [x0] +; VBITS_GE_512: ret %op1 = load <8 x i32>, <8 x i32>* %a %op2 = load <8 x i32>, <8 x i32>* %b %insert = insertelement <8 x i64> undef, i64 32, i64 0 @@ -339,12 +392,19 @@ define void @smulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { define void @smulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { ; CHECK-LABEL: smulh_v16i32: -; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,16)]] -; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] -; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] -; VBITS_GE_512: smulh 
[[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s -; VBITS_GE_512: st1w { [[RES]].s }, [[PG]], [x0] -; VBITS_GE_512: ret +; VBITS_EQ_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,16)]] +; VBITS_EQ_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_EQ_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_EQ_512: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; VBITS_EQ_512: st1w { [[RES]].s }, [[PG]], [x0] +; VBITS_EQ_512: ret + +; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,16)]] +; VBITS_GE_1024-DAG: ld1sw { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_1024-DAG: ld1sw { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_GE_1024: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d +; VBITS_GE_1024: st1w { [[RES]].d }, [[PG]], [x0] +; VBITS_GE_1024: ret %op1 = load <16 x i32>, <16 x i32>* %a %op2 = load <16 x i32>, <16 x i32>* %b %insert = insertelement <16 x i64> undef, i64 32, i64 0 @@ -360,12 +420,20 @@ define void @smulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { define void @smulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 { ; CHECK-LABEL: smulh_v32i32: -; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,32)]] -; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] -; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] -; VBITS_GE_1024: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s -; VBITS_GE_1024: st1w { [[RES]].s }, [[PG]], [x0] -; VBITS_GE_1024: ret +; VBITS_EQ_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,32)]] +; VBITS_EQ_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_EQ_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_EQ_1024: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; VBITS_EQ_1024: st1w { [[RES]].s }, [[PG]], [x0] +; VBITS_EQ_1024: ret + +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,32)]] +; VBITS_GE_2048-DAG: ld1sw { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_2048-DAG: ld1sw { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] +; 
VBITS_GE_2048: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d +; VBITS_GE_2048: lsr [[RES]].d, [[PG]]/m, [[RES]].d, #32 +; VBITS_GE_2048: st1w { [[RES]].d }, [[PG]], [x0] +; VBITS_GE_2048: ret %op1 = load <32 x i32>, <32 x i32>* %a %op2 = load <32 x i32>, <32 x i32>* %b %insert = insertelement <32 x i64> undef, i64 32, i64 0 @@ -563,12 +631,20 @@ define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { define void @umulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { ; CHECK-LABEL: umulh_v32i8: -; VBITS_GE_256: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]] -; VBITS_GE_256-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] -; VBITS_GE_256-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] -; VBITS_GE_256: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b -; VBITS_GE_256: st1b { [[RES]].b }, [[PG]], [x0] -; VBITS_GE_256: ret +; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]] +; VBITS_EQ_256-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] +; VBITS_EQ_256: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b +; VBITS_EQ_256: st1b { [[RES]].b }, [[PG]], [x0] +; VBITS_EQ_256: ret + +; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,32)]] +; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_GE_512: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; VBITS_GE_512: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8 +; VBITS_GE_512: st1b { [[RES]].h }, [[PG]], [x0] +; VBITS_GE_512: ret %op1 = load <32 x i8>, <32 x i8>* %a %op2 = load <32 x i8>, <32 x i8>* %b %insert = insertelement <32 x i16> undef, i16 8, i64 0 @@ -584,12 +660,18 @@ define void @umulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { define void @umulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { ; CHECK-LABEL: umulh_v64i8: -; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]] -; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
-; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] -; VBITS_GE_512: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b -; VBITS_GE_512: st1b { [[RES]].b }, [[PG]], [x0] -; VBITS_GE_512: ret +; VBITS_EQ_512: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]] +; VBITS_EQ_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] +; VBITS_EQ_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] +; VBITS_EQ_512: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b +; VBITS_EQ_512: ret + +; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_GE_1024: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; VBITS_GE_1024: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8 +; VBITS_GE_1024: st1b { [[RES]].h }, [[PG]], [x0] +; VBITS_GE_1024: ret %op1 = load <64 x i8>, <64 x i8>* %a %op2 = load <64 x i8>, <64 x i8>* %b %insert = insertelement <64 x i16> undef, i16 8, i64 0 @@ -605,12 +687,20 @@ define void @umulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { define void @umulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 { ; CHECK-LABEL: umulh_v128i8: -; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]] -; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] -; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] -; VBITS_GE_1024: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b -; VBITS_GE_1024: st1b { [[RES]].b }, [[PG]], [x0] -; VBITS_GE_1024: ret +; VBITS_EQ_1024: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]] +; VBITS_EQ_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] +; VBITS_EQ_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] +; VBITS_EQ_1024: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b +; VBITS_EQ_1024: st1b { [[RES]].b }, [[PG]], [x0] +; VBITS_EQ_1024: ret + +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,128)]] +; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].h },
[[PG]]/z, [x1] +; VBITS_GE_2048: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; VBITS_GE_2048: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8 +; VBITS_GE_2048: st1b { [[RES]].h }, [[PG]], [x0] +; VBITS_GE_2048: ret %op1 = load <128 x i8>, <128 x i8>* %a %op2 = load <128 x i8>, <128 x i8>* %b %insert = insertelement <128 x i16> undef, i16 8, i64 0 @@ -686,12 +776,20 @@ define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { define void @umulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { ; CHECK-LABEL: umulh_v16i16: -; VBITS_GE_256: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,16)]] -; VBITS_GE_256-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] -; VBITS_GE_256-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] -; VBITS_GE_256: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h -; VBITS_GE_256: st1h { [[RES]].h }, [[PG]], [x0] -; VBITS_GE_256: ret +; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,16)]] +; VBITS_EQ_256-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_EQ_256: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; VBITS_EQ_256: st1h { [[RES]].h }, [[PG]], [x0] +; VBITS_EQ_256: ret + +; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,16)]] +; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_GE_512: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; VBITS_GE_512: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16 +; VBITS_GE_512: st1h { [[RES]].s }, [[PG]], [x0] +; VBITS_GE_512: ret %op1 = load <16 x i16>, <16 x i16>* %a %op2 = load <16 x i16>, <16 x i16>* %b %insert = insertelement <16 x i32> undef, i32 16, i64 0 @@ -707,12 +805,20 @@ define void @umulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { define void @umulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { ; CHECK-LABEL: umulh_v32i16: -; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,32)]] -; VBITS_GE_512-DAG: ld1h {
[[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] -; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] -; VBITS_GE_512: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h -; VBITS_GE_512: st1h { [[RES]].h }, [[PG]], [x0] -; VBITS_GE_512: ret +; VBITS_EQ_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,32)]] +; VBITS_EQ_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_EQ_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_EQ_512: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; VBITS_EQ_512: st1h { [[RES]].h }, [[PG]], [x0] +; VBITS_EQ_512: ret + +; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,32)]] +; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_GE_1024: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; VBITS_GE_1024: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16 +; VBITS_GE_1024: st1h { [[RES]].s }, [[PG]], [x0] +; VBITS_GE_1024: ret %op1 = load <32 x i16>, <32 x i16>* %a %op2 = load <32 x i16>, <32 x i16>* %b %insert = insertelement <32 x i32> undef, i32 16, i64 0 @@ -728,12 +834,19 @@ define void @umulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { define void @umulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 { ; CHECK-LABEL: umulh_v64i16: -; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,64)]] -; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] -; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] -; VBITS_GE_1024: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h -; VBITS_GE_1024: st1h { [[RES]].h }, [[PG]], [x0] -; VBITS_GE_1024: ret +; VBITS_EQ_1024: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,64)]] +; VBITS_EQ_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_EQ_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_EQ_1024: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; VBITS_EQ_1024: ret + +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,64)]] +; 
VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_GE_2048: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; VBITS_GE_2048: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16 +; VBITS_GE_2048: st1h { [[RES]].s }, [[PG]], [x0] +; VBITS_GE_2048: ret %op1 = load <64 x i16>, <64 x i16>* %a %op2 = load <64 x i16>, <64 x i16>* %b %insert = insertelement <64 x i32> undef, i32 16, i64 0 @@ -806,12 +919,20 @@ define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { define void @umulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { ; CHECK-LABEL: umulh_v8i32: -; VBITS_GE_256: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,8)]] -; VBITS_GE_256-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] -; VBITS_GE_256-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] -; VBITS_GE_256: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s -; VBITS_GE_256: st1w { [[RES]].s }, [[PG]], [x0] -; VBITS_GE_256: ret +; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,8)]] +; VBITS_EQ_256-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_EQ_256: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; VBITS_EQ_256: st1w { [[RES]].s }, [[PG]], [x0] +; VBITS_EQ_256: ret + +; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,8)]] +; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_GE_512: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d +; VBITS_GE_512: lsr [[RES]].d, [[PG]]/m, [[RES]].d, #32 +; VBITS_GE_512: st1w { [[RES]].d }, [[PG]], [x0] +; VBITS_GE_512: ret %op1 = load <8 x i32>, <8 x i32>* %a %op2 = load <8 x i32>, <8 x i32>* %b %insert = insertelement <8 x i64> undef, i64 32, i64 0 @@ -827,12 +948,19 @@ define void @umulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { define void @umulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { ; CHECK-LABEL: 
umulh_v16i32: -; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,16)]] -; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] -; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] -; VBITS_GE_512: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s -; VBITS_GE_512: st1w { [[RES]].s }, [[PG]], [x0] -; VBITS_GE_512: ret +; VBITS_EQ_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,16)]] +; VBITS_EQ_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_EQ_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_EQ_512: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; VBITS_EQ_512: st1w { [[RES]].s }, [[PG]], [x0] + +; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,16)]] +; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_GE_1024: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d +; VBITS_GE_1024: lsr [[RES]].d, [[PG]]/m, [[RES]].d, #32 +; VBITS_GE_1024: st1w { [[RES]].d }, [[PG]], [x0] +; VBITS_GE_1024: ret %op1 = load <16 x i32>, <16 x i32>* %a %op2 = load <16 x i32>, <16 x i32>* %b %insert = insertelement <16 x i64> undef, i64 32, i64 0 @@ -848,12 +976,20 @@ define void @umulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { define void @umulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 { ; CHECK-LABEL: umulh_v32i32: -; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,32)]] -; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] -; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] -; VBITS_GE_1024: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s -; VBITS_GE_1024: st1w { [[RES]].s }, [[PG]], [x0] -; VBITS_GE_1024: ret +; VBITS_EQ_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,32)]] +; VBITS_EQ_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_EQ_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_EQ_1024: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; VBITS_EQ_1024: 
st1w { [[RES]].s }, [[PG]], [x0] +; VBITS_EQ_1024: ret + +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,32)]] +; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_GE_2048: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d +; VBITS_GE_2048: lsr [[RES]].d, [[PG]]/m, [[RES]].d, #32 +; VBITS_GE_2048: st1w { [[RES]].d }, [[PG]], [x0] +; VBITS_GE_2048: ret %op1 = load <32 x i32>, <32 x i32>* %a %op2 = load <32 x i32>, <32 x i32>* %b %insert = insertelement <32 x i64> undef, i64 32, i64 0