[AArch64][SVE] Add SVE2 intrinsics for pairwise arithmetic

Summary:
Implements the following intrinsics:
 - addp
 - smaxp, sminp, umaxp & uminp
 - sadalp & uadalp

Reviewers: dancgr, efriedma, sdesmalen, c-rhodes

Reviewed By: c-rhodes

Subscribers: tschuett, kristof.beyls, hiraditya, rkruppe, psnobl, cameron.mcinally, cfe-commits, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D73347
This commit is contained in:
Kerry McLaughlin 2020-01-29 09:57:30 +00:00
parent 7116e431c0
commit bd33a46213
5 changed files with 388 additions and 38 deletions

View File

@ -1032,6 +1032,13 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
LLVMVectorOfBitcastsToInt<0>], LLVMVectorOfBitcastsToInt<0>],
[IntrNoMem]>; [IntrNoMem]>;
class SVE2_2VectorArg_Pred_Long_Intrinsic
: Intrinsic<[llvm_anyvector_ty],
[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
LLVMMatchType<0>,
LLVMSubdivide2VectorType<0>],
[IntrNoMem]>;
class SVE2_3VectorArg_Long_Intrinsic class SVE2_3VectorArg_Long_Intrinsic
: Intrinsic<[llvm_anyvector_ty], : Intrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, [LLVMMatchType<0>,
@ -1662,11 +1669,23 @@ def int_aarch64_sve_st1_scatter_scalar_offset : AdvSIMD_ScatterStore_VectorBase_
// SVE2 - Non-widening pairwise arithmetic // SVE2 - Non-widening pairwise arithmetic
// //
def int_aarch64_sve_addp : AdvSIMD_Pred2VectorArg_Intrinsic;
def int_aarch64_sve_faddp : AdvSIMD_Pred2VectorArg_Intrinsic; def int_aarch64_sve_faddp : AdvSIMD_Pred2VectorArg_Intrinsic;
def int_aarch64_sve_fmaxp : AdvSIMD_Pred2VectorArg_Intrinsic; def int_aarch64_sve_fmaxp : AdvSIMD_Pred2VectorArg_Intrinsic;
def int_aarch64_sve_fmaxnmp : AdvSIMD_Pred2VectorArg_Intrinsic; def int_aarch64_sve_fmaxnmp : AdvSIMD_Pred2VectorArg_Intrinsic;
def int_aarch64_sve_fminp : AdvSIMD_Pred2VectorArg_Intrinsic; def int_aarch64_sve_fminp : AdvSIMD_Pred2VectorArg_Intrinsic;
def int_aarch64_sve_fminnmp : AdvSIMD_Pred2VectorArg_Intrinsic; def int_aarch64_sve_fminnmp : AdvSIMD_Pred2VectorArg_Intrinsic;
def int_aarch64_sve_smaxp : AdvSIMD_Pred2VectorArg_Intrinsic;
def int_aarch64_sve_sminp : AdvSIMD_Pred2VectorArg_Intrinsic;
def int_aarch64_sve_umaxp : AdvSIMD_Pred2VectorArg_Intrinsic;
def int_aarch64_sve_uminp : AdvSIMD_Pred2VectorArg_Intrinsic;
//
// SVE2 - Widening pairwise arithmetic
//
def int_aarch64_sve_sadalp : SVE2_2VectorArg_Pred_Long_Intrinsic;
def int_aarch64_sve_uadalp : SVE2_2VectorArg_Pred_Long_Intrinsic;
// //
// SVE2 - Floating-point widening multiply-accumulate // SVE2 - Floating-point widening multiply-accumulate

View File

@ -1503,25 +1503,25 @@ let Predicates = [HasSVE2] in {
defm SQDMLSLBT_ZZZ : sve2_int_mla_long<0b00011, "sqdmlslbt">; defm SQDMLSLBT_ZZZ : sve2_int_mla_long<0b00011, "sqdmlslbt">;
// SVE2 integer halving add/subtract (predicated) // SVE2 integer halving add/subtract (predicated)
defm SHADD_ZPmZ : sve2_int_arith_pred<0b100000, "shadd">; defm SHADD_ZPmZ : sve2_int_arith_pred<0b100000, "shadd", null_frag>;
defm UHADD_ZPmZ : sve2_int_arith_pred<0b100010, "uhadd">; defm UHADD_ZPmZ : sve2_int_arith_pred<0b100010, "uhadd", null_frag>;
defm SHSUB_ZPmZ : sve2_int_arith_pred<0b100100, "shsub">; defm SHSUB_ZPmZ : sve2_int_arith_pred<0b100100, "shsub", null_frag>;
defm UHSUB_ZPmZ : sve2_int_arith_pred<0b100110, "uhsub">; defm UHSUB_ZPmZ : sve2_int_arith_pred<0b100110, "uhsub", null_frag>;
defm SRHADD_ZPmZ : sve2_int_arith_pred<0b101000, "srhadd">; defm SRHADD_ZPmZ : sve2_int_arith_pred<0b101000, "srhadd", null_frag>;
defm URHADD_ZPmZ : sve2_int_arith_pred<0b101010, "urhadd">; defm URHADD_ZPmZ : sve2_int_arith_pred<0b101010, "urhadd", null_frag>;
defm SHSUBR_ZPmZ : sve2_int_arith_pred<0b101100, "shsubr">; defm SHSUBR_ZPmZ : sve2_int_arith_pred<0b101100, "shsubr", null_frag>;
defm UHSUBR_ZPmZ : sve2_int_arith_pred<0b101110, "uhsubr">; defm UHSUBR_ZPmZ : sve2_int_arith_pred<0b101110, "uhsubr", null_frag>;
// SVE2 integer pairwise add and accumulate long // SVE2 integer pairwise add and accumulate long
defm SADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<0, "sadalp">; defm SADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<0, "sadalp", int_aarch64_sve_sadalp>;
defm UADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<1, "uadalp">; defm UADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<1, "uadalp", int_aarch64_sve_uadalp>;
// SVE2 integer pairwise arithmetic // SVE2 integer pairwise arithmetic
defm ADDP_ZPmZ : sve2_int_arith_pred<0b100011, "addp">; defm ADDP_ZPmZ : sve2_int_arith_pred<0b100011, "addp", int_aarch64_sve_addp>;
defm SMAXP_ZPmZ : sve2_int_arith_pred<0b101001, "smaxp">; defm SMAXP_ZPmZ : sve2_int_arith_pred<0b101001, "smaxp", int_aarch64_sve_smaxp>;
defm UMAXP_ZPmZ : sve2_int_arith_pred<0b101011, "umaxp">; defm UMAXP_ZPmZ : sve2_int_arith_pred<0b101011, "umaxp", int_aarch64_sve_umaxp>;
defm SMINP_ZPmZ : sve2_int_arith_pred<0b101101, "sminp">; defm SMINP_ZPmZ : sve2_int_arith_pred<0b101101, "sminp", int_aarch64_sve_sminp>;
defm UMINP_ZPmZ : sve2_int_arith_pred<0b101111, "uminp">; defm UMINP_ZPmZ : sve2_int_arith_pred<0b101111, "uminp", int_aarch64_sve_uminp>;
// SVE2 integer unary operations (predicated) // SVE2 integer unary operations (predicated)
defm URECPE_ZPmZ : sve2_int_un_pred_arit_s<0b000, "urecpe">; defm URECPE_ZPmZ : sve2_int_un_pred_arit_s<0b000, "urecpe">;
@ -1530,28 +1530,28 @@ let Predicates = [HasSVE2] in {
defm SQNEG_ZPmZ : sve2_int_un_pred_arit<0b101, "sqneg">; defm SQNEG_ZPmZ : sve2_int_un_pred_arit<0b101, "sqneg">;
// SVE2 saturating add/subtract // SVE2 saturating add/subtract
defm SQADD_ZPmZ : sve2_int_arith_pred<0b110000, "sqadd">; defm SQADD_ZPmZ : sve2_int_arith_pred<0b110000, "sqadd", null_frag>;
defm UQADD_ZPmZ : sve2_int_arith_pred<0b110010, "uqadd">; defm UQADD_ZPmZ : sve2_int_arith_pred<0b110010, "uqadd", null_frag>;
defm SQSUB_ZPmZ : sve2_int_arith_pred<0b110100, "sqsub">; defm SQSUB_ZPmZ : sve2_int_arith_pred<0b110100, "sqsub", null_frag>;
defm UQSUB_ZPmZ : sve2_int_arith_pred<0b110110, "uqsub">; defm UQSUB_ZPmZ : sve2_int_arith_pred<0b110110, "uqsub", null_frag>;
defm SUQADD_ZPmZ : sve2_int_arith_pred<0b111000, "suqadd">; defm SUQADD_ZPmZ : sve2_int_arith_pred<0b111000, "suqadd", null_frag>;
defm USQADD_ZPmZ : sve2_int_arith_pred<0b111010, "usqadd">; defm USQADD_ZPmZ : sve2_int_arith_pred<0b111010, "usqadd", null_frag>;
defm SQSUBR_ZPmZ : sve2_int_arith_pred<0b111100, "sqsubr">; defm SQSUBR_ZPmZ : sve2_int_arith_pred<0b111100, "sqsubr", null_frag>;
defm UQSUBR_ZPmZ : sve2_int_arith_pred<0b111110, "uqsubr">; defm UQSUBR_ZPmZ : sve2_int_arith_pred<0b111110, "uqsubr", null_frag>;
// SVE2 saturating/rounding bitwise shift left (predicated) // SVE2 saturating/rounding bitwise shift left (predicated)
defm SRSHL_ZPmZ : sve2_int_arith_pred<0b000100, "srshl">; defm SRSHL_ZPmZ : sve2_int_arith_pred<0b000100, "srshl", null_frag>;
defm URSHL_ZPmZ : sve2_int_arith_pred<0b000110, "urshl">; defm URSHL_ZPmZ : sve2_int_arith_pred<0b000110, "urshl", null_frag>;
defm SRSHLR_ZPmZ : sve2_int_arith_pred<0b001100, "srshlr">; defm SRSHLR_ZPmZ : sve2_int_arith_pred<0b001100, "srshlr", null_frag>;
defm URSHLR_ZPmZ : sve2_int_arith_pred<0b001110, "urshlr">; defm URSHLR_ZPmZ : sve2_int_arith_pred<0b001110, "urshlr", null_frag>;
defm SQSHL_ZPmZ : sve2_int_arith_pred<0b010000, "sqshl">; defm SQSHL_ZPmZ : sve2_int_arith_pred<0b010000, "sqshl", null_frag>;
defm UQSHL_ZPmZ : sve2_int_arith_pred<0b010010, "uqshl">; defm UQSHL_ZPmZ : sve2_int_arith_pred<0b010010, "uqshl", null_frag>;
defm SQRSHL_ZPmZ : sve2_int_arith_pred<0b010100, "sqrshl">; defm SQRSHL_ZPmZ : sve2_int_arith_pred<0b010100, "sqrshl", null_frag>;
defm UQRSHL_ZPmZ : sve2_int_arith_pred<0b010110, "uqrshl">; defm UQRSHL_ZPmZ : sve2_int_arith_pred<0b010110, "uqrshl", null_frag>;
defm SQSHLR_ZPmZ : sve2_int_arith_pred<0b011000, "sqshlr">; defm SQSHLR_ZPmZ : sve2_int_arith_pred<0b011000, "sqshlr", null_frag>;
defm UQSHLR_ZPmZ : sve2_int_arith_pred<0b011010, "uqshlr">; defm UQSHLR_ZPmZ : sve2_int_arith_pred<0b011010, "uqshlr", null_frag>;
defm SQRSHLR_ZPmZ : sve2_int_arith_pred<0b011100, "sqrshlr">; defm SQRSHLR_ZPmZ : sve2_int_arith_pred<0b011100, "sqrshlr", null_frag>;
defm UQRSHLR_ZPmZ : sve2_int_arith_pred<0b011110, "uqrshlr">; defm UQRSHLR_ZPmZ : sve2_int_arith_pred<0b011110, "uqrshlr", null_frag>;
// SVE2 predicated shifts // SVE2 predicated shifts
defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl">; defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl">;

View File

@ -2716,7 +2716,7 @@ class sve2_int_arith_pred<bits<2> sz, bits<6> opc, string asm,
bits<5> Zdn; bits<5> Zdn;
let Inst{31-24} = 0b01000100; let Inst{31-24} = 0b01000100;
let Inst{23-22} = sz; let Inst{23-22} = sz;
let Inst{21} = 0b0; let Inst{21-20} = 0b01;
let Inst{20-16} = opc{5-1}; let Inst{20-16} = opc{5-1};
let Inst{15-14} = 0b10; let Inst{15-14} = 0b10;
let Inst{13} = opc{0}; let Inst{13} = opc{0};
@ -2729,11 +2729,16 @@ class sve2_int_arith_pred<bits<2> sz, bits<6> opc, string asm,
let ElementSize = zprty.ElementSize; let ElementSize = zprty.ElementSize;
} }
multiclass sve2_int_arith_pred<bits<6> opc, string asm> { multiclass sve2_int_arith_pred<bits<6> opc, string asm, SDPatternOperator op> {
def _B : sve2_int_arith_pred<0b00, opc, asm, ZPR8>; def _B : sve2_int_arith_pred<0b00, opc, asm, ZPR8>;
def _H : sve2_int_arith_pred<0b01, opc, asm, ZPR16>; def _H : sve2_int_arith_pred<0b01, opc, asm, ZPR16>;
def _S : sve2_int_arith_pred<0b10, opc, asm, ZPR32>; def _S : sve2_int_arith_pred<0b10, opc, asm, ZPR32>;
def _D : sve2_int_arith_pred<0b11, opc, asm, ZPR64>; def _D : sve2_int_arith_pred<0b11, opc, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
} }
class sve2_int_sadd_long_accum_pairwise<bits<2> sz, bit U, string asm, class sve2_int_sadd_long_accum_pairwise<bits<2> sz, bit U, string asm,
@ -2757,10 +2762,14 @@ class sve2_int_sadd_long_accum_pairwise<bits<2> sz, bit U, string asm,
let ElementSize = zprty1.ElementSize; let ElementSize = zprty1.ElementSize;
} }
multiclass sve2_int_sadd_long_accum_pairwise<bit U, string asm> { multiclass sve2_int_sadd_long_accum_pairwise<bit U, string asm, SDPatternOperator op> {
def _H : sve2_int_sadd_long_accum_pairwise<0b01, U, asm, ZPR16, ZPR8>; def _H : sve2_int_sadd_long_accum_pairwise<0b01, U, asm, ZPR16, ZPR8>;
def _S : sve2_int_sadd_long_accum_pairwise<0b10, U, asm, ZPR32, ZPR16>; def _S : sve2_int_sadd_long_accum_pairwise<0b10, U, asm, ZPR32, ZPR16>;
def _D : sve2_int_sadd_long_accum_pairwise<0b11, U, asm, ZPR64, ZPR32>; def _D : sve2_int_sadd_long_accum_pairwise<0b11, U, asm, ZPR64, ZPR32>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv16i8, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv8i16, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv4i32, !cast<Instruction>(NAME # _D)>;
} }
class sve2_int_un_pred_arit<bits<2> sz, bit Q, bits<2> opc, class sve2_int_un_pred_arit<bits<2> sz, bit Q, bits<2> opc,

View File

@ -1,5 +1,49 @@
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s
;
; ADDP
;
define <vscale x 16 x i8> @addp_i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: addp_i8:
; CHECK: addp z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT: ret
%out = call <vscale x 16 x i8> @llvm.aarch64.sve.addp.nxv16i8(<vscale x 16 x i1> %pg,
<vscale x 16 x i8> %a,
<vscale x 16 x i8> %b)
ret <vscale x 16 x i8> %out
}
define <vscale x 8 x i16> @addp_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: addp_i16:
; CHECK: addp z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT: ret
%out = call <vscale x 8 x i16> @llvm.aarch64.sve.addp.nxv8i16(<vscale x 8 x i1> %pg,
<vscale x 8 x i16> %a,
<vscale x 8 x i16> %b)
ret <vscale x 8 x i16> %out
}
define <vscale x 4 x i32> @addp_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: addp_i32:
; CHECK: addp z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: ret
%out = call <vscale x 4 x i32> @llvm.aarch64.sve.addp.nxv4i32(<vscale x 4 x i1> %pg,
<vscale x 4 x i32> %a,
<vscale x 4 x i32> %b)
ret <vscale x 4 x i32> %out
}
define <vscale x 2 x i64> @addp_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: addp_i64:
; CHECK: addp z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: ret
%out = call <vscale x 2 x i64> @llvm.aarch64.sve.addp.nxv2i64(<vscale x 2 x i1> %pg,
<vscale x 2 x i64> %a,
<vscale x 2 x i64> %b)
ret <vscale x 2 x i64> %out
}
; ;
; FADDP ; FADDP
; ;
@ -170,6 +214,187 @@ define <vscale x 2 x double> @fminnmp_f64(<vscale x 2 x i1> %pg, <vscale x 2 x d
ret <vscale x 2 x double> %out ret <vscale x 2 x double> %out
} }
;
; SMAXP
;
define <vscale x 16 x i8> @smaxp_i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: smaxp_i8:
; CHECK: smaxp z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT: ret
%out = call <vscale x 16 x i8> @llvm.aarch64.sve.smaxp.nxv16i8(<vscale x 16 x i1> %pg,
<vscale x 16 x i8> %a,
<vscale x 16 x i8> %b)
ret <vscale x 16 x i8> %out
}
define <vscale x 8 x i16> @smaxp_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: smaxp_i16:
; CHECK: smaxp z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT: ret
%out = call <vscale x 8 x i16> @llvm.aarch64.sve.smaxp.nxv8i16(<vscale x 8 x i1> %pg,
<vscale x 8 x i16> %a,
<vscale x 8 x i16> %b)
ret <vscale x 8 x i16> %out
}
define <vscale x 4 x i32> @smaxp_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: smaxp_i32:
; CHECK: smaxp z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: ret
%out = call <vscale x 4 x i32> @llvm.aarch64.sve.smaxp.nxv4i32(<vscale x 4 x i1> %pg,
<vscale x 4 x i32> %a,
<vscale x 4 x i32> %b)
ret <vscale x 4 x i32> %out
}
define <vscale x 2 x i64> @smaxp_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: smaxp_i64:
; CHECK: smaxp z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: ret
%out = call <vscale x 2 x i64> @llvm.aarch64.sve.smaxp.nxv2i64(<vscale x 2 x i1> %pg,
<vscale x 2 x i64> %a,
<vscale x 2 x i64> %b)
ret <vscale x 2 x i64> %out
}
;
; SMINP
;
define <vscale x 16 x i8> @sminp_i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: sminp_i8:
; CHECK: sminp z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT: ret
%out = call <vscale x 16 x i8> @llvm.aarch64.sve.sminp.nxv16i8(<vscale x 16 x i1> %pg,
<vscale x 16 x i8> %a,
<vscale x 16 x i8> %b)
ret <vscale x 16 x i8> %out
}
define <vscale x 8 x i16> @sminp_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: sminp_i16:
; CHECK: sminp z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT: ret
%out = call <vscale x 8 x i16> @llvm.aarch64.sve.sminp.nxv8i16(<vscale x 8 x i1> %pg,
<vscale x 8 x i16> %a,
<vscale x 8 x i16> %b)
ret <vscale x 8 x i16> %out
}
define <vscale x 4 x i32> @sminp_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: sminp_i32:
; CHECK: sminp z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: ret
%out = call <vscale x 4 x i32> @llvm.aarch64.sve.sminp.nxv4i32(<vscale x 4 x i1> %pg,
<vscale x 4 x i32> %a,
<vscale x 4 x i32> %b)
ret <vscale x 4 x i32> %out
}
define <vscale x 2 x i64> @sminp_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: sminp_i64:
; CHECK: sminp z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: ret
%out = call <vscale x 2 x i64> @llvm.aarch64.sve.sminp.nxv2i64(<vscale x 2 x i1> %pg,
<vscale x 2 x i64> %a,
<vscale x 2 x i64> %b)
ret <vscale x 2 x i64> %out
}
;
; UMINP
;
define <vscale x 16 x i8> @uminp_i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: uminp_i8:
; CHECK: uminp z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT: ret
%out = call <vscale x 16 x i8> @llvm.aarch64.sve.uminp.nxv16i8(<vscale x 16 x i1> %pg,
<vscale x 16 x i8> %a,
<vscale x 16 x i8> %b)
ret <vscale x 16 x i8> %out
}
define <vscale x 8 x i16> @uminp_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: uminp_i16:
; CHECK: uminp z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT: ret
%out = call <vscale x 8 x i16> @llvm.aarch64.sve.uminp.nxv8i16(<vscale x 8 x i1> %pg,
<vscale x 8 x i16> %a,
<vscale x 8 x i16> %b)
ret <vscale x 8 x i16> %out
}
define <vscale x 4 x i32> @uminp_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: uminp_i32:
; CHECK: uminp z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: ret
%out = call <vscale x 4 x i32> @llvm.aarch64.sve.uminp.nxv4i32(<vscale x 4 x i1> %pg,
<vscale x 4 x i32> %a,
<vscale x 4 x i32> %b)
ret <vscale x 4 x i32> %out
}
define <vscale x 2 x i64> @uminp_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: uminp_i64:
; CHECK: uminp z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: ret
%out = call <vscale x 2 x i64> @llvm.aarch64.sve.uminp.nxv2i64(<vscale x 2 x i1> %pg,
<vscale x 2 x i64> %a,
<vscale x 2 x i64> %b)
ret <vscale x 2 x i64> %out
}
;
; UMAXP
;
define <vscale x 16 x i8> @umaxp_i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: umaxp_i8:
; CHECK: umaxp z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT: ret
%out = call <vscale x 16 x i8> @llvm.aarch64.sve.umaxp.nxv16i8(<vscale x 16 x i1> %pg,
<vscale x 16 x i8> %a,
<vscale x 16 x i8> %b)
ret <vscale x 16 x i8> %out
}
define <vscale x 8 x i16> @umaxp_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: umaxp_i16:
; CHECK: umaxp z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT: ret
%out = call <vscale x 8 x i16> @llvm.aarch64.sve.umaxp.nxv8i16(<vscale x 8 x i1> %pg,
<vscale x 8 x i16> %a,
<vscale x 8 x i16> %b)
ret <vscale x 8 x i16> %out
}
define <vscale x 4 x i32> @umaxp_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: umaxp_i32:
; CHECK: umaxp z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: ret
%out = call <vscale x 4 x i32> @llvm.aarch64.sve.umaxp.nxv4i32(<vscale x 4 x i1> %pg,
<vscale x 4 x i32> %a,
<vscale x 4 x i32> %b)
ret <vscale x 4 x i32> %out
}
define <vscale x 2 x i64> @umaxp_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: umaxp_i64:
; CHECK: umaxp z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: ret
%out = call <vscale x 2 x i64> @llvm.aarch64.sve.umaxp.nxv2i64(<vscale x 2 x i1> %pg,
<vscale x 2 x i64> %a,
<vscale x 2 x i64> %b)
ret <vscale x 2 x i64> %out
}
declare <vscale x 16 x i8> @llvm.aarch64.sve.addp.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 8 x i16> @llvm.aarch64.sve.addp.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.addp.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.addp.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare <vscale x 8 x half> @llvm.aarch64.sve.faddp.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>) declare <vscale x 8 x half> @llvm.aarch64.sve.faddp.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
declare <vscale x 4 x float> @llvm.aarch64.sve.faddp.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>) declare <vscale x 4 x float> @llvm.aarch64.sve.faddp.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
declare <vscale x 2 x double> @llvm.aarch64.sve.faddp.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>) declare <vscale x 2 x double> @llvm.aarch64.sve.faddp.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
@ -189,3 +414,23 @@ declare <vscale x 2 x double> @llvm.aarch64.sve.fminp.nxv2f64(<vscale x 2 x i1>,
declare <vscale x 8 x half> @llvm.aarch64.sve.fminnmp.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>) declare <vscale x 8 x half> @llvm.aarch64.sve.fminnmp.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
declare <vscale x 4 x float> @llvm.aarch64.sve.fminnmp.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>) declare <vscale x 4 x float> @llvm.aarch64.sve.fminnmp.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
declare <vscale x 2 x double> @llvm.aarch64.sve.fminnmp.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>) declare <vscale x 2 x double> @llvm.aarch64.sve.fminnmp.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
declare <vscale x 16 x i8> @llvm.aarch64.sve.smaxp.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 8 x i16> @llvm.aarch64.sve.smaxp.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.smaxp.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.smaxp.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare <vscale x 16 x i8> @llvm.aarch64.sve.sminp.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 8 x i16> @llvm.aarch64.sve.sminp.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.sminp.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.sminp.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare <vscale x 16 x i8> @llvm.aarch64.sve.umaxp.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 8 x i16> @llvm.aarch64.sve.umaxp.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.umaxp.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.umaxp.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare <vscale x 16 x i8> @llvm.aarch64.sve.uminp.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 8 x i16> @llvm.aarch64.sve.uminp.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.uminp.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.uminp.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)

View File

@ -0,0 +1,77 @@
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s
;
; SADALP
;
define <vscale x 8 x i16> @sadalp_i8(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: sadalp_i8:
; CHECK: sadalp z0.h, p0/m, z1.b
; CHECK-NEXT: ret
%out = call <vscale x 8 x i16> @llvm.aarch64.sve.sadalp.nxv8i16(<vscale x 8 x i1> %pg,
<vscale x 8 x i16> %a,
<vscale x 16 x i8> %b)
ret <vscale x 8 x i16> %out
}
define <vscale x 4 x i32> @sadalp_i16(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: sadalp_i16:
; CHECK: sadalp z0.s, p0/m, z1.h
; CHECK-NEXT: ret
%out = call <vscale x 4 x i32> @llvm.aarch64.sve.sadalp.nxv4i32(<vscale x 4 x i1> %pg,
<vscale x 4 x i32> %a,
<vscale x 8 x i16> %b)
ret <vscale x 4 x i32> %out
}
define <vscale x 2 x i64> @sadalp_i32(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: sadalp_i32:
; CHECK: sadalp z0.d, p0/m, z1.s
; CHECK-NEXT: ret
%out = call <vscale x 2 x i64> @llvm.aarch64.sve.sadalp.nxv2i64(<vscale x 2 x i1> %pg,
<vscale x 2 x i64> %a,
<vscale x 4 x i32> %b)
ret <vscale x 2 x i64> %out
}
;
; UADALP
;
define <vscale x 8 x i16> @uadalp_i8(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: uadalp_i8:
; CHECK: uadalp z0.h, p0/m, z1.b
; CHECK-NEXT: ret
%out = call <vscale x 8 x i16> @llvm.aarch64.sve.uadalp.nxv8i16(<vscale x 8 x i1> %pg,
<vscale x 8 x i16> %a,
<vscale x 16 x i8> %b)
ret <vscale x 8 x i16> %out
}
define <vscale x 4 x i32> @uadalp_i16(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: uadalp_i16:
; CHECK: uadalp z0.s, p0/m, z1.h
; CHECK-NEXT: ret
%out = call <vscale x 4 x i32> @llvm.aarch64.sve.uadalp.nxv4i32(<vscale x 4 x i1> %pg,
<vscale x 4 x i32> %a,
<vscale x 8 x i16> %b)
ret <vscale x 4 x i32> %out
}
define <vscale x 2 x i64> @uadalp_i32(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: uadalp_i32:
; CHECK: uadalp z0.d, p0/m, z1.s
; CHECK-NEXT: ret
%out = call <vscale x 2 x i64> @llvm.aarch64.sve.uadalp.nxv2i64(<vscale x 2 x i1> %pg,
<vscale x 2 x i64> %a,
<vscale x 4 x i32> %b)
ret <vscale x 2 x i64> %out
}
declare <vscale x 8 x i16> @llvm.aarch64.sve.sadalp.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 16 x i8>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.sadalp.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 8 x i16>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.sadalp.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 4 x i32>)
declare <vscale x 8 x i16> @llvm.aarch64.sve.uadalp.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 16 x i8>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.uadalp.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 8 x i16>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.uadalp.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 4 x i32>)