[AArch64] Add IR intrinsics for sq(r)dmulh_lane(q)

Summary:
Currently, sqdmulh_lane and friends from the ACLE (implemented in arm_neon.h)
are represented in LLVM IR as a (by-vector) sqdmulh preceded by a shufflevector
that splats the selected lane, like so:

   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)

When %v's value is known at compile time, the shufflevector is constant-folded
away and the backend can no longer select the lane variant of sqdmulh.
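
For example, if %v is known to be <i16 2, i16 3, i16 4, i16 5>, the splat of
lane 3 folds to a constant operand and the lane structure is lost (illustrative
IR, assuming that constant):

   %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> <i16 5, i16 5, i16 5, i16 5>)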

This defeats a (hand-coded) optimization that packs several constants into a
single vector and uses the lane intrinsics to reduce register pressure, trading
the materialisation of several separate constants for a single vector load from
the constant pool, like so:

   int16x8_t v = {2,3,4,5,6,7,8,9};
   a = vqdmulh_laneq_s16(a, v, 0);
   b = vqdmulh_laneq_s16(b, v, 1);
   c = vqdmulh_laneq_s16(c, v, 2);
   d = vqdmulh_laneq_s16(d, v, 3);
   [...]
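
With the lane structure preserved, this selects to a single constant-pool load
plus by-lane multiplies, along these lines (illustrative assembly; label and
register assignments are hypothetical):

   adrp    x8, .LCPI0_0
   ldr     q0, [x8, :lo12:.LCPI0_0]   // q0 = {2,3,4,5,6,7,8,9}
   sqdmulh v1.4h, v1.4h, v0.h[0]
   sqdmulh v2.4h, v2.4h, v0.h[1]
   sqdmulh v3.4h, v3.4h, v0.h[2]
   sqdmulh v4.4h, v4.4h, v0.h[3]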

In one microbenchmark from libjpeg-turbo this accounts for a 2.5% to 4%
performance difference.

We could teach the compiler to recover the lane variants, but this would likely
require its own pass. (Alternatively, "volatile" could be used on the constant
vector, but this is a bit ugly.)

This patch instead implements the following LLVM IR intrinsics for AArch64 to
maintain the original structure through IR optimization and into instruction
selection:
- sqdmulh_lane
- sqdmulh_laneq
- sqrdmulh_lane
- sqrdmulh_laneq
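
With these in place, the laneq example above lowers to a single IR call that
carries the lane index explicitly, e.g. (as in the updated tests below):

   %r = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> %a, <8 x i16> %v, i32 3)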

These 'lane' variants need an additional register class: when operating on i16
elements, the second argument must live in the lower half (D0-D15) of the
64-bit NEON register file.
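
This restriction comes from the encoding: the by-element forms with .h lanes
have only a 4-bit register field, so only v0-v15 can be encoded (illustrative
assembly, not from the patch):

   sqdmulh v0.4h, v0.4h, v15.h[3]   // encodeable: v15 is in the lower half
   sqdmulh v0.4h, v0.4h, v16.h[3]   // not encodeable: v16 needs a fifth bit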

Note that the existing patterns that combine a shufflevector and sqdmulh into
sqdmulh_lane (etc.) remain, so code that does not rely on the NEON intrinsics
to generate these instructions is not affected.

This patch also changes clang to emit these IR intrinsics for the corresponding
NEON intrinsics (AArch64 only).

Reviewers: SjoerdMeijer, dmgreen, t.p.northover, rovka, rengolin, efriedma

Reviewed By: efriedma

Subscribers: kristof.beyls, hiraditya, jdoerfert, cfe-commits, llvm-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D71469
Author: Sanne Wouda
Date:   2020-01-29 13:07:15 +00:00
Commit: 2939fc13c8 (parent f719b0ba13)

11 changed files with 549 additions and 166 deletions

@@ -528,9 +528,16 @@ def VMULL_LANE : SOpInst<"vmull_lane", "(>Q)..I", "siUsUi", OP_MULL_LN>;
def VQDMULL_N : SOpInst<"vqdmull_n", "(>Q).1", "si", OP_QDMULL_N>;
def VQDMULL_LANE : SOpInst<"vqdmull_lane", "(>Q)..I", "si", OP_QDMULL_LN>;
def VQDMULH_N : SOpInst<"vqdmulh_n", "..1", "siQsQi", OP_QDMULH_N>;
def VQDMULH_LANE : SOpInst<"vqdmulh_lane", "..qI", "siQsQi", OP_QDMULH_LN>;
def VQRDMULH_N : SOpInst<"vqrdmulh_n", "..1", "siQsQi", OP_QRDMULH_N>;
let ArchGuard = "!defined(__aarch64__)" in {
def VQDMULH_LANE : SOpInst<"vqdmulh_lane", "..qI", "siQsQi", OP_QDMULH_LN>;
def VQRDMULH_LANE : SOpInst<"vqrdmulh_lane", "..qI", "siQsQi", OP_QRDMULH_LN>;
}
let ArchGuard = "defined(__aarch64__)" in {
def A64_VQDMULH_LANE : SInst<"vqdmulh_lane", "..qI", "siQsQi">;
def A64_VQRDMULH_LANE : SInst<"vqrdmulh_lane", "..qI", "siQsQi">;
}
let ArchGuard = "defined(__ARM_FEATURE_QRDMX)" in {
def VQRDMLAH_LANE : SOpInst<"vqrdmlah_lane", "...qI", "siQsQi", OP_QRDMLAH_LN>;
@@ -951,9 +958,10 @@ def VQDMULL_HIGH_LANE : SOpInst<"vqdmull_high_lane", "(>Q)Q.I", "si",
def VQDMULL_HIGH_LANEQ : SOpInst<"vqdmull_high_laneq", "(>Q)QQI", "si",
OP_QDMULLHi_LN>;
def VQDMULH_LANEQ : SOpInst<"vqdmulh_laneq", "..QI", "siQsQi", OP_QDMULH_LN>;
def VQRDMULH_LANEQ : SOpInst<"vqrdmulh_laneq", "..QI", "siQsQi", OP_QRDMULH_LN>;
let isLaneQ = 1 in {
def VQDMULH_LANEQ : SInst<"vqdmulh_laneq", "..QI", "siQsQi">;
def VQRDMULH_LANEQ : SInst<"vqrdmulh_laneq", "..QI", "siQsQi">;
}
let ArchGuard = "defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__)" in {
def VQRDMLAH_LANEQ : SOpInst<"vqrdmlah_laneq", "...QI", "siQsQi", OP_QRDMLAH_LN>;
def VQRDMLSH_LANEQ : SOpInst<"vqrdmlsh_laneq", "...QI", "siQsQi", OP_QRDMLSH_LN>;

@@ -4969,14 +4969,22 @@ static const NeonIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
NEONMAP2(vqaddq_v, aarch64_neon_uqadd, aarch64_neon_sqadd, Add1ArgType | UnsignedAlts),
NEONMAP2(vqdmlal_v, aarch64_neon_sqdmull, aarch64_neon_sqadd, 0),
NEONMAP2(vqdmlsl_v, aarch64_neon_sqdmull, aarch64_neon_sqsub, 0),
NEONMAP1(vqdmulh_lane_v, aarch64_neon_sqdmulh_lane, 0),
NEONMAP1(vqdmulh_laneq_v, aarch64_neon_sqdmulh_laneq, 0),
NEONMAP1(vqdmulh_v, aarch64_neon_sqdmulh, Add1ArgType),
NEONMAP1(vqdmulhq_lane_v, aarch64_neon_sqdmulh_lane, 0),
NEONMAP1(vqdmulhq_laneq_v, aarch64_neon_sqdmulh_laneq, 0),
NEONMAP1(vqdmulhq_v, aarch64_neon_sqdmulh, Add1ArgType),
NEONMAP1(vqdmull_v, aarch64_neon_sqdmull, Add1ArgType),
NEONMAP2(vqmovn_v, aarch64_neon_uqxtn, aarch64_neon_sqxtn, Add1ArgType | UnsignedAlts),
NEONMAP1(vqmovun_v, aarch64_neon_sqxtun, Add1ArgType),
NEONMAP1(vqneg_v, aarch64_neon_sqneg, Add1ArgType),
NEONMAP1(vqnegq_v, aarch64_neon_sqneg, Add1ArgType),
NEONMAP1(vqrdmulh_lane_v, aarch64_neon_sqrdmulh_lane, 0),
NEONMAP1(vqrdmulh_laneq_v, aarch64_neon_sqrdmulh_laneq, 0),
NEONMAP1(vqrdmulh_v, aarch64_neon_sqrdmulh, Add1ArgType),
NEONMAP1(vqrdmulhq_lane_v, aarch64_neon_sqrdmulh_lane, 0),
NEONMAP1(vqrdmulhq_laneq_v, aarch64_neon_sqrdmulh_laneq, 0),
NEONMAP1(vqrdmulhq_v, aarch64_neon_sqrdmulh, Add1ArgType),
NEONMAP2(vqrshl_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts),
NEONMAP2(vqrshlq_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts),
@@ -5754,6 +5762,24 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
Ops.resize(2);
return EmitNeonCall(CGM.getIntrinsic(AltLLVMIntrinsic, Ty), Ops, NameHint);
}
case NEON::BI__builtin_neon_vqdmulhq_lane_v:
case NEON::BI__builtin_neon_vqdmulh_lane_v:
case NEON::BI__builtin_neon_vqrdmulhq_lane_v:
case NEON::BI__builtin_neon_vqrdmulh_lane_v: {
llvm::Type *Tys[2] = {
Ty, GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
/*isQuad*/ false))};
return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint);
}
case NEON::BI__builtin_neon_vqdmulhq_laneq_v:
case NEON::BI__builtin_neon_vqdmulh_laneq_v:
case NEON::BI__builtin_neon_vqrdmulhq_laneq_v:
case NEON::BI__builtin_neon_vqrdmulh_laneq_v: {
llvm::Type *Tys[2] = {
Ty, GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
/*isQuad*/ true))};
return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint);
}
case NEON::BI__builtin_neon_vqshl_n_v:
case NEON::BI__builtin_neon_vqshlq_n_v:
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshl_n",

@@ -1440,12 +1440,12 @@ int64x2_t test_vqdmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
// CHECK-LABEL: @test_vqdmulh_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> [[VQDMULH_LANE_V]], <4 x i16> [[VQDMULH_LANE_V1]], i32 3)
// CHECK-NEXT: ret <4 x i16> [[VQDMULH_LANE_V2]]
//
int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) {
return vqdmulh_lane_s16(a, v, 3);
@@ -1453,12 +1453,12 @@ int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) {
// CHECK-LABEL: @test_vqdmulhq_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16> [[VQDMULHQ_LANE_V]], <4 x i16> [[VQDMULHQ_LANE_V1]], i32 3)
// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_LANE_V2]]
//
int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
return vqdmulhq_lane_s16(a, v, 3);
@@ -1466,12 +1466,12 @@ int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
// CHECK-LABEL: @test_vqdmulh_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQDMULH_V2_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32> [[VQDMULH_LANE_V]], <2 x i32> [[VQDMULH_LANE_V1]], i32 1)
// CHECK-NEXT: ret <2 x i32> [[VQDMULH_LANE_V2]]
//
int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) {
return vqdmulh_lane_s32(a, v, 1);
@@ -1479,12 +1479,12 @@ int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) {
// CHECK-LABEL: @test_vqdmulhq_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32> [[VQDMULHQ_LANE_V]], <2 x i32> [[VQDMULHQ_LANE_V1]], i32 1)
// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_LANE_V2]]
//
int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
return vqdmulhq_lane_s32(a, v, 1);
@@ -1492,12 +1492,12 @@ int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
// CHECK-LABEL: @test_vqrdmulh_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16> [[VQRDMULH_LANE_V]], <4 x i16> [[VQRDMULH_LANE_V1]], i32 3)
// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_LANE_V2]]
//
int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) {
return vqrdmulh_lane_s16(a, v, 3);
@@ -1505,12 +1505,12 @@ int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) {
// CHECK-LABEL: @test_vqrdmulhq_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16> [[VQRDMULHQ_LANE_V]], <4 x i16> [[VQRDMULHQ_LANE_V1]], i32 3)
// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_LANE_V2]]
//
int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
return vqrdmulhq_lane_s16(a, v, 3);
@@ -1518,12 +1518,12 @@ int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
// CHECK-LABEL: @test_vqrdmulh_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32> [[VQRDMULH_LANE_V]], <2 x i32> [[VQRDMULH_LANE_V1]], i32 1)
// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_LANE_V2]]
//
int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) {
return vqrdmulh_lane_s32(a, v, 1);
@@ -1531,12 +1531,12 @@ int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) {
// CHECK-LABEL: @test_vqrdmulhq_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32> [[VQRDMULHQ_LANE_V]], <2 x i32> [[VQRDMULHQ_LANE_V1]], i32 1)
// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_LANE_V2]]
//
int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
return vqrdmulhq_lane_s32(a, v, 1);
@@ -3066,12 +3066,12 @@ int64x2_t test_vqdmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
// CHECK-LABEL: @test_vqdmulh_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> [[VQDMULH_LANE_V]], <4 x i16> [[VQDMULH_LANE_V1]], i32 0)
// CHECK-NEXT: ret <4 x i16> [[VQDMULH_LANE_V2]]
//
int16x4_t test_vqdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
return vqdmulh_lane_s16(a, v, 0);
@@ -3079,12 +3079,12 @@ int16x4_t test_vqdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
// CHECK-LABEL: @test_vqdmulhq_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16> [[VQDMULHQ_LANE_V]], <4 x i16> [[VQDMULHQ_LANE_V1]], i32 0)
// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_LANE_V2]]
//
int16x8_t test_vqdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
return vqdmulhq_lane_s16(a, v, 0);
@@ -3092,12 +3092,12 @@ int16x8_t test_vqdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
// CHECK-LABEL: @test_vqdmulh_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQDMULH_V2_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32> [[VQDMULH_LANE_V]], <2 x i32> [[VQDMULH_LANE_V1]], i32 0)
// CHECK-NEXT: ret <2 x i32> [[VQDMULH_LANE_V2]]
//
int32x2_t test_vqdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
return vqdmulh_lane_s32(a, v, 0);
@@ -3105,12 +3105,12 @@ int32x2_t test_vqdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
// CHECK-LABEL: @test_vqdmulhq_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32> [[VQDMULHQ_LANE_V]], <2 x i32> [[VQDMULHQ_LANE_V1]], i32 0)
// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_LANE_V2]]
//
int32x4_t test_vqdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
return vqdmulhq_lane_s32(a, v, 0);
@@ -3118,12 +3118,12 @@ int32x4_t test_vqdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
// CHECK-LABEL: @test_vqrdmulh_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16> [[VQRDMULH_LANE_V]], <4 x i16> [[VQRDMULH_LANE_V1]], i32 0)
// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_LANE_V2]]
//
int16x4_t test_vqrdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
return vqrdmulh_lane_s16(a, v, 0);
@@ -3131,12 +3131,12 @@ int16x4_t test_vqrdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
// CHECK-LABEL: @test_vqrdmulhq_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16> [[VQRDMULHQ_LANE_V]], <4 x i16> [[VQRDMULHQ_LANE_V1]], i32 0)
// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_LANE_V2]]
//
int16x8_t test_vqrdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
return vqrdmulhq_lane_s16(a, v, 0);
@@ -3144,12 +3144,12 @@ int16x8_t test_vqrdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
// CHECK-LABEL: @test_vqrdmulh_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32> [[VQRDMULH_LANE_V]], <2 x i32> [[VQRDMULH_LANE_V1]], i32 0)
// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_LANE_V2]]
//
int32x2_t test_vqrdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
return vqrdmulh_lane_s32(a, v, 0);
@@ -3157,12 +3157,12 @@ int32x2_t test_vqrdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
// CHECK-LABEL: @test_vqrdmulhq_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32> [[VQRDMULHQ_LANE_V]], <2 x i32> [[VQRDMULHQ_LANE_V1]], i32 0)
// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_LANE_V2]]
//
int32x4_t test_vqrdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
return vqrdmulhq_lane_s32(a, v, 0);
@@ -4753,12 +4753,12 @@ int64x2_t test_vqdmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
// CHECK-LABEL: @test_vqdmulh_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQDMULH_LANEQ_V]], <8 x i16> [[VQDMULH_LANEQ_V1]], i32 0)
// CHECK-NEXT: ret <4 x i16> [[VQDMULH_LANEQ_V2]]
//
int16x4_t test_vqdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
return vqdmulh_laneq_s16(a, v, 0);
@@ -4766,12 +4766,12 @@ int16x4_t test_vqdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
// CHECK-LABEL: @test_vqdmulhq_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQDMULHQ_LANEQ_V]], <8 x i16> [[VQDMULHQ_LANEQ_V1]], i32 0)
// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_LANEQ_V2]]
//
int16x8_t test_vqdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
return vqdmulhq_laneq_s16(a, v, 0);
@@ -4779,12 +4779,12 @@ int16x8_t test_vqdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
// CHECK-LABEL: @test_vqdmulh_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQDMULH_V2_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQDMULH_LANEQ_V]], <4 x i32> [[VQDMULH_LANEQ_V1]], i32 0)
// CHECK-NEXT: ret <2 x i32> [[VQDMULH_LANEQ_V2]]
//
int32x2_t test_vqdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
return vqdmulh_laneq_s32(a, v, 0);
@@ -4792,12 +4792,12 @@ int32x2_t test_vqdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
// CHECK-LABEL: @test_vqdmulhq_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQDMULHQ_LANEQ_V]], <4 x i32> [[VQDMULHQ_LANEQ_V1]], i32 0)
// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_LANEQ_V2]]
//
int32x4_t test_vqdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
return vqdmulhq_laneq_s32(a, v, 0);
@@ -4805,12 +4805,12 @@ int32x4_t test_vqdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
// CHECK-LABEL: @test_vqrdmulh_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQRDMULH_LANEQ_V]], <8 x i16> [[VQRDMULH_LANEQ_V1]], i32 0)
// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_LANEQ_V2]]
//
int16x4_t test_vqrdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
return vqrdmulh_laneq_s16(a, v, 0);
@@ -4818,12 +4818,12 @@ int16x4_t test_vqrdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
// CHECK-LABEL: @test_vqrdmulhq_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQRDMULHQ_LANEQ_V]], <8 x i16> [[VQRDMULHQ_LANEQ_V1]], i32 0)
// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_LANEQ_V2]]
//
int16x8_t test_vqrdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
return vqrdmulhq_laneq_s16(a, v, 0);
@@ -4831,12 +4831,12 @@ int16x8_t test_vqrdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
// CHECK-LABEL: @test_vqrdmulh_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQRDMULH_LANEQ_V]], <4 x i32> [[VQRDMULH_LANEQ_V1]], i32 0)
// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_LANEQ_V2]]
//
int32x2_t test_vqrdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
return vqrdmulh_laneq_s32(a, v, 0);
@@ -4844,12 +4844,12 @@ int32x2_t test_vqrdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
// CHECK-LABEL: @test_vqrdmulhq_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQRDMULHQ_LANEQ_V]], <4 x i32> [[VQRDMULHQ_LANEQ_V1]], i32 0)
// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_LANEQ_V2]]
//
int32x4_t test_vqrdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
return vqrdmulhq_laneq_s32(a, v, 0);
@@ -5149,12 +5149,12 @@ int64x2_t test_vqdmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
// CHECK-LABEL: @test_vqdmulh_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQDMULH_LANEQ_V]], <8 x i16> [[VQDMULH_LANEQ_V1]], i32 7)
// CHECK-NEXT: ret <4 x i16> [[VQDMULH_LANEQ_V2]]
//
int16x4_t test_vqdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
return vqdmulh_laneq_s16(a, v, 7);
@@ -5162,12 +5162,12 @@ int16x4_t test_vqdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
// CHECK-LABEL: @test_vqdmulhq_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQDMULHQ_LANEQ_V]], <8 x i16> [[VQDMULHQ_LANEQ_V1]], i32 7)
// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_LANEQ_V2]]
//
int16x8_t test_vqdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
return vqdmulhq_laneq_s16(a, v, 7);
@@ -5175,12 +5175,12 @@ int16x8_t test_vqdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
// CHECK-LABEL: @test_vqdmulh_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQDMULH_V2_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQDMULH_LANEQ_V]], <4 x i32> [[VQDMULH_LANEQ_V1]], i32 3)
// CHECK-NEXT: ret <2 x i32> [[VQDMULH_LANEQ_V2]]
//
int32x2_t test_vqdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
return vqdmulh_laneq_s32(a, v, 3);
@@ -5188,12 +5188,12 @@ int32x2_t test_vqdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
// CHECK-LABEL: @test_vqdmulhq_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQDMULHQ_LANEQ_V]], <4 x i32> [[VQDMULHQ_LANEQ_V1]], i32 3)
// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_LANEQ_V2]]
//
int32x4_t test_vqdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
return vqdmulhq_laneq_s32(a, v, 3);
@@ -5201,12 +5201,12 @@ int32x4_t test_vqdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
// CHECK-LABEL: @test_vqrdmulh_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQRDMULH_LANEQ_V]], <8 x i16> [[VQRDMULH_LANEQ_V1]], i32 7)
// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_LANEQ_V2]]
//
int16x4_t test_vqrdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
return vqrdmulh_laneq_s16(a, v, 7);
@@ -5214,12 +5214,12 @@ int16x4_t test_vqrdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
// CHECK-LABEL: @test_vqrdmulhq_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQRDMULHQ_LANEQ_V]], <8 x i16> [[VQRDMULHQ_LANEQ_V1]], i32 7)
// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_LANEQ_V2]]
//
int16x8_t test_vqrdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
return vqrdmulhq_laneq_s16(a, v, 7);
@@ -5227,12 +5227,12 @@ int16x8_t test_vqrdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
// CHECK-LABEL: @test_vqrdmulh_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQRDMULH_LANEQ_V]], <4 x i32> [[VQRDMULH_LANEQ_V1]], i32 3)
// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_LANEQ_V2]]
//
int32x2_t test_vqrdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
return vqrdmulh_laneq_s32(a, v, 3);
@@ -5240,12 +5240,12 @@ int32x2_t test_vqrdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
// CHECK-LABEL: @test_vqrdmulhq_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQRDMULHQ_LANEQ_V]], <4 x i32> [[VQRDMULHQ_LANEQ_V1]], i32 3)
// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_LANEQ_V2]]
//
int32x4_t test_vqrdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
return vqrdmulhq_laneq_s32(a, v, 3);

@@ -133,6 +133,10 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
: Intrinsic<[llvm_anyvector_ty],
[LLVMHalfElementsVectorType<0>, llvm_anyvector_ty],
[IntrNoMem]>;
class AdvSIMD_2VectorArg_Lane_Intrinsic
: Intrinsic<[llvm_anyint_ty],
[LLVMMatchType<0>, llvm_anyint_ty, llvm_i32_ty],
[IntrNoMem]>;
class AdvSIMD_3VectorArg_Intrinsic
: Intrinsic<[llvm_anyvector_ty],
@@ -207,9 +211,13 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
// Vector Saturating Doubling Multiply High
def int_aarch64_neon_sqdmulh : AdvSIMD_2IntArg_Intrinsic;
def int_aarch64_neon_sqdmulh_lane : AdvSIMD_2VectorArg_Lane_Intrinsic;
def int_aarch64_neon_sqdmulh_laneq : AdvSIMD_2VectorArg_Lane_Intrinsic;
// Vector Saturating Rounding Doubling Multiply High
def int_aarch64_neon_sqrdmulh : AdvSIMD_2IntArg_Intrinsic;
def int_aarch64_neon_sqrdmulh_lane : AdvSIMD_2VectorArg_Lane_Intrinsic;
def int_aarch64_neon_sqrdmulh_laneq : AdvSIMD_2VectorArg_Lane_Intrinsic;
// Vector Polynominal Multiply
def int_aarch64_neon_pmul : AdvSIMD_2VectorArg_Intrinsic;

@@ -360,6 +360,9 @@ def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>;
def am_indexedu6s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedU6S128", []>;
def am_indexeds9s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedS9S128", []>;
def UImmS1XForm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i64);
}]>;
def UImmS2XForm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue() / 2, SDLoc(N), MVT::i64);
}]>;
@@ -7968,6 +7971,64 @@ multiclass SIMDFPIndexedTied<bit U, bits<4> opc, string asm> {
}
}
multiclass SIMDIndexedHSPatterns<SDPatternOperator OpNodeLane,
SDPatternOperator OpNodeLaneQ> {
def : Pat<(v4i16 (OpNodeLane
(v4i16 V64:$Rn), (v4i16 V64_lo:$Rm),
VectorIndexS32b:$idx)),
(!cast<Instruction>(NAME # v4i16_indexed) $Rn,
(SUBREG_TO_REG (i32 0), (v4i16 V64_lo:$Rm), dsub),
(UImmS1XForm $idx))>;
def : Pat<(v4i16 (OpNodeLaneQ
(v4i16 V64:$Rn), (v8i16 V128_lo:$Rm),
VectorIndexH32b:$idx)),
(!cast<Instruction>(NAME # v4i16_indexed) $Rn, $Rm,
(UImmS1XForm $idx))>;
def : Pat<(v8i16 (OpNodeLane
(v8i16 V128:$Rn), (v4i16 V64_lo:$Rm),
VectorIndexS32b:$idx)),
(!cast<Instruction>(NAME # v8i16_indexed) $Rn,
(SUBREG_TO_REG (i32 0), $Rm, dsub),
(UImmS1XForm $idx))>;
def : Pat<(v8i16 (OpNodeLaneQ
(v8i16 V128:$Rn), (v8i16 V128_lo:$Rm),
VectorIndexH32b:$idx)),
(!cast<Instruction>(NAME # v8i16_indexed) $Rn, $Rm,
(UImmS1XForm $idx))>;
def : Pat<(v2i32 (OpNodeLane
(v2i32 V64:$Rn), (v2i32 V64:$Rm),
VectorIndexD32b:$idx)),
(!cast<Instruction>(NAME # v2i32_indexed) $Rn,
(SUBREG_TO_REG (i32 0), (v2i32 V64_lo:$Rm), dsub),
(UImmS1XForm $idx))>;
def : Pat<(v2i32 (OpNodeLaneQ
(v2i32 V64:$Rn), (v4i32 V128:$Rm),
VectorIndexS32b:$idx)),
(!cast<Instruction>(NAME # v2i32_indexed) $Rn, $Rm,
(UImmS1XForm $idx))>;
def : Pat<(v4i32 (OpNodeLane
(v4i32 V128:$Rn), (v2i32 V64:$Rm),
VectorIndexD32b:$idx)),
(!cast<Instruction>(NAME # v4i32_indexed) $Rn,
(SUBREG_TO_REG (i32 0), $Rm, dsub),
(UImmS1XForm $idx))>;
def : Pat<(v4i32 (OpNodeLaneQ
(v4i32 V128:$Rn),
(v4i32 V128:$Rm),
VectorIndexS32b:$idx)),
(!cast<Instruction>(NAME # v4i32_indexed) $Rn, $Rm,
(UImmS1XForm $idx))>;
}
multiclass SIMDIndexedHS<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode> {
def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V64, V64,

@@ -5631,6 +5631,11 @@ def : Pat<(v2f64 (fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))),
defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_aarch64_neon_sqdmulh>;
defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
defm SQDMULH : SIMDIndexedHSPatterns<int_aarch64_neon_sqdmulh_lane,
int_aarch64_neon_sqdmulh_laneq>;
defm SQRDMULH : SIMDIndexedHSPatterns<int_aarch64_neon_sqrdmulh_lane,
int_aarch64_neon_sqrdmulh_laneq>;
// Generated by MachineCombine
defm MLA : SIMDVectorIndexedHSTied<1, 0b0000, "mla", null_frag>;
defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls", null_frag>;

@@ -230,6 +230,7 @@ AArch64RegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
case AArch64::FPR16RegClassID:
case AArch64::FPR32RegClassID:
case AArch64::FPR64RegClassID:
case AArch64::FPR64_loRegClassID:
case AArch64::FPR128RegClassID:
case AArch64::FPR128_loRegClassID:
case AArch64::DDRegClassID:

@@ -596,6 +596,7 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
return 32;
case AArch64::FPR128_loRegClassID:
case AArch64::FPR64_loRegClassID:
return 16;
}
}

@@ -429,6 +429,10 @@ def FPR32 : RegisterClass<"AArch64", [f32, i32], 32,(sequence "S%u", 0, 31)>;
def FPR64 : RegisterClass<"AArch64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32,
v1i64, v4f16],
64, (sequence "D%u", 0, 31)>;
def FPR64_lo : RegisterClass<"AArch64",
[v8i8, v4i16, v2i32, v1i64, v4f16, v2f32, v1f64],
64, (trunc FPR64, 16)>;
// We don't (yet) have an f128 legal type, so don't use that here. We
// normalize 128-bit vectors to v2f64 for arg passing and such, so use
// that here.
@@ -503,6 +507,9 @@ def VectorRegLoAsmOperand : AsmOperandClass {
let Name = "VectorRegLo";
let PredicateMethod = "isNeonVectorRegLo";
}
def V64_lo : RegisterOperand<FPR64_lo, "printVRegOperand"> {
let ParserMatchClass = VectorRegLoAsmOperand;
}
def V128_lo : RegisterOperand<FPR128_lo, "printVRegOperand"> {
let ParserMatchClass = VectorRegLoAsmOperand;
}

@@ -1033,8 +1033,10 @@ public:
bool isNeonVectorRegLo() const {
return Kind == k_Register && Reg.Kind == RegKind::NeonVector &&
AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains(
Reg.RegNum);
(AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains(
Reg.RegNum) ||
AArch64MCRegisterClasses[AArch64::FPR64_loRegClassID].contains(
Reg.RegNum));
}
template <unsigned Class> bool isSVEVectorReg() const {

@@ -9,20 +9,36 @@ declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>)
declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>)
declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32>, <2 x i32>, i32)
declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32>, <4 x i32>, i32)
declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32>, <2 x i32>, i32)
declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32>, <4 x i32>, i32)
declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16>, <4 x i16>, i32)
declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16>, <8 x i16>, i32)
declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16>, <4 x i16>, i32)
declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16>, <8 x i16>, i32)
declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32>, <2 x i32>, i32)
declare <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32>, <4 x i32>, i32)
declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>)
declare <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32>, <2 x i32>, i32)
declare <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32>, <4 x i32>, i32)
declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16>, <4 x i16>, i32)
declare <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16>, <8 x i16>, i32)
declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>)
declare <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16>, <4 x i16>, i32)
declare <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16>, <8 x i16>, i32)
declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>)
@@ -1515,6 +1531,37 @@ entry:
ret <4 x i16> %vqdmulh2.i
}
define <4 x i16> @test_vqdmulh_lane_s16_intrinsic(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmulh_lane_s16_intrinsic:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqdmulh v0.4h, v0.4h, v1.h[3]
; CHECK-NEXT: ret
entry:
%vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> %a, <4 x i16> %v, i32 3)
ret <4 x i16> %vqdmulh2.i
}
define <4 x i16> @test_vqdmulh_laneq_s16_intrinsic_lo(<4 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vqdmulh_laneq_s16_intrinsic_lo:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqdmulh v0.4h, v0.4h, v1.h[3]
; CHECK-NEXT: ret
entry:
%vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> %a, <8 x i16> %v, i32 3)
ret <4 x i16> %vqdmulh2.i
}
define <4 x i16> @test_vqdmulh_laneq_s16_intrinsic_hi(<4 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vqdmulh_laneq_s16_intrinsic_hi:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqdmulh v0.4h, v0.4h, v1.h[7]
; CHECK-NEXT: ret
entry:
%vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> %a, <8 x i16> %v, i32 7)
ret <4 x i16> %vqdmulh2.i
}
define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmulhq_lane_s16:
; CHECK: // %bb.0: // %entry
@@ -1527,6 +1574,37 @@ entry:
ret <8 x i16> %vqdmulh2.i
}
define <8 x i16> @test_vqdmulhq_lane_s16_intrinsic(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmulhq_lane_s16_intrinsic:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqdmulh v0.8h, v0.8h, v1.h[3]
; CHECK-NEXT: ret
entry:
%vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16> %a, <4 x i16> %v, i32 3)
ret <8 x i16> %vqdmulh2.i
}
define <8 x i16> @test_vqdmulhq_laneq_s16_intrinsic_lo(<8 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vqdmulhq_laneq_s16_intrinsic_lo:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqdmulh v0.8h, v0.8h, v1.h[3]
; CHECK-NEXT: ret
entry:
%vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> %a, <8 x i16> %v, i32 3)
ret <8 x i16> %vqdmulh2.i
}
define <8 x i16> @test_vqdmulhq_laneq_s16_intrinsic_hi(<8 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vqdmulhq_laneq_s16_intrinsic_hi:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqdmulh v0.8h, v0.8h, v1.h[7]
; CHECK-NEXT: ret
entry:
%vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> %a, <8 x i16> %v, i32 7)
ret <8 x i16> %vqdmulh2.i
}

define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmulh_lane_s32:
; CHECK: // %bb.0: // %entry
@@ -1539,6 +1617,37 @@ entry:
ret <2 x i32> %vqdmulh2.i
}
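
; The i32 element variants. Unlike i16, the i32 lane operand is not
; restricted to the lower half of the NEON register file.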
define <2 x i32> @test_vqdmulh_lane_s32_intrinsic(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmulh_lane_s32_intrinsic:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqdmulh v0.2s, v0.2s, v1.s[1]
; CHECK-NEXT: ret
entry:
%vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32> %a, <2 x i32> %v, i32 1)
ret <2 x i32> %vqdmulh2.i
}

define <2 x i32> @test_vqdmulh_laneq_s32_intrinsic_lo(<2 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vqdmulh_laneq_s32_intrinsic_lo:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqdmulh v0.2s, v0.2s, v1.s[1]
; CHECK-NEXT: ret
entry:
%vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> %a, <4 x i32> %v, i32 1)
ret <2 x i32> %vqdmulh2.i
}

define <2 x i32> @test_vqdmulh_laneq_s32_intrinsic_hi(<2 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vqdmulh_laneq_s32_intrinsic_hi:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqdmulh v0.2s, v0.2s, v1.s[3]
; CHECK-NEXT: ret
entry:
%vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> %a, <4 x i32> %v, i32 3)
ret <2 x i32> %vqdmulh2.i
}

define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmulhq_lane_s32:
; CHECK: // %bb.0: // %entry
@@ -1551,6 +1660,37 @@ entry:
ret <4 x i32> %vqdmulh2.i
}
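
; The 128-bit (4s) sqdmulh lane/laneq intrinsics.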
define <4 x i32> @test_vqdmulhq_lane_s32_intrinsic(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmulhq_lane_s32_intrinsic:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqdmulh v0.4s, v0.4s, v1.s[1]
; CHECK-NEXT: ret
entry:
%vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32> %a, <2 x i32> %v, i32 1)
ret <4 x i32> %vqdmulh2.i
}

define <4 x i32> @test_vqdmulhq_laneq_s32_intrinsic_lo(<4 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vqdmulhq_laneq_s32_intrinsic_lo:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqdmulh v0.4s, v0.4s, v1.s[1]
; CHECK-NEXT: ret
entry:
%vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> %a, <4 x i32> %v, i32 1)
ret <4 x i32> %vqdmulh2.i
}

define <4 x i32> @test_vqdmulhq_laneq_s32_intrinsic_hi(<4 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vqdmulhq_laneq_s32_intrinsic_hi:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqdmulh v0.4s, v0.4s, v1.s[3]
; CHECK-NEXT: ret
entry:
%vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> %a, <4 x i32> %v, i32 3)
ret <4 x i32> %vqdmulh2.i
}

define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqrdmulh_lane_s16:
; CHECK: // %bb.0: // %entry
@@ -1563,6 +1703,37 @@ entry:
ret <4 x i16> %vqrdmulh2.i
}
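
; The same set of lane/laneq tests for the rounding variant, SQRDMULH.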
define <4 x i16> @test_vqrdmulh_lane_s16_intrinsic(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqrdmulh_lane_s16_intrinsic:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqrdmulh v0.4h, v0.4h, v1.h[3]
; CHECK-NEXT: ret
entry:
%vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16> %a, <4 x i16> %v, i32 3)
ret <4 x i16> %vqrdmulh2.i
}

define <4 x i16> @test_vqrdmulh_laneq_s16_intrinsic_lo(<4 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vqrdmulh_laneq_s16_intrinsic_lo:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqrdmulh v0.4h, v0.4h, v1.h[3]
; CHECK-NEXT: ret
entry:
%vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> %a, <8 x i16> %v, i32 3)
ret <4 x i16> %vqrdmulh2.i
}

define <4 x i16> @test_vqrdmulh_laneq_s16_intrinsic_hi(<4 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vqrdmulh_laneq_s16_intrinsic_hi:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqrdmulh v0.4h, v0.4h, v1.h[7]
; CHECK-NEXT: ret
entry:
%vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> %a, <8 x i16> %v, i32 7)
ret <4 x i16> %vqrdmulh2.i
}

define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqrdmulhq_lane_s16:
; CHECK: // %bb.0: // %entry
@@ -1575,6 +1746,37 @@ entry:
ret <8 x i16> %vqrdmulh2.i
}
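
; 128-bit (8h) sqrdmulh lane/laneq intrinsics.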
define <8 x i16> @test_vqrdmulhq_lane_s16_intrinsic(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqrdmulhq_lane_s16_intrinsic:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqrdmulh v0.8h, v0.8h, v1.h[3]
; CHECK-NEXT: ret
entry:
%vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16> %a, <4 x i16> %v, i32 3)
ret <8 x i16> %vqrdmulh2.i
}

define <8 x i16> @test_vqrdmulhq_laneq_s16_intrinsic_lo(<8 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vqrdmulhq_laneq_s16_intrinsic_lo:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqrdmulh v0.8h, v0.8h, v1.h[3]
; CHECK-NEXT: ret
entry:
%vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> %a, <8 x i16> %v, i32 3)
ret <8 x i16> %vqrdmulh2.i
}

define <8 x i16> @test_vqrdmulhq_laneq_s16_intrinsic_hi(<8 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vqrdmulhq_laneq_s16_intrinsic_hi:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqrdmulh v0.8h, v0.8h, v1.h[7]
; CHECK-NEXT: ret
entry:
%vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> %a, <8 x i16> %v, i32 7)
ret <8 x i16> %vqrdmulh2.i
}

define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqrdmulh_lane_s32:
; CHECK: // %bb.0: // %entry
@@ -1587,6 +1789,37 @@ entry:
ret <2 x i32> %vqrdmulh2.i
}
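
; sqrdmulh lane/laneq on i32 elements (2s destination).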
define <2 x i32> @test_vqrdmulh_lane_s32_intrinsic(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqrdmulh_lane_s32_intrinsic:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqrdmulh v0.2s, v0.2s, v1.s[1]
; CHECK-NEXT: ret
entry:
%vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32> %a, <2 x i32> %v, i32 1)
ret <2 x i32> %vqrdmulh2.i
}

define <2 x i32> @test_vqrdmulh_laneq_s32_intrinsic_lo(<2 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vqrdmulh_laneq_s32_intrinsic_lo:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqrdmulh v0.2s, v0.2s, v1.s[1]
; CHECK-NEXT: ret
entry:
%vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> %a, <4 x i32> %v, i32 1)
ret <2 x i32> %vqrdmulh2.i
}

define <2 x i32> @test_vqrdmulh_laneq_s32_intrinsic_hi(<2 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vqrdmulh_laneq_s32_intrinsic_hi:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqrdmulh v0.2s, v0.2s, v1.s[3]
; CHECK-NEXT: ret
entry:
%vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> %a, <4 x i32> %v, i32 3)
ret <2 x i32> %vqrdmulh2.i
}

define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqrdmulhq_lane_s32:
; CHECK: // %bb.0: // %entry
@@ -1599,6 +1832,37 @@ entry:
ret <4 x i32> %vqrdmulh2.i
}
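
; 128-bit (4s) sqrdmulh lane/laneq intrinsics.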
define <4 x i32> @test_vqrdmulhq_lane_s32_intrinsic(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqrdmulhq_lane_s32_intrinsic:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqrdmulh v0.4s, v0.4s, v1.s[1]
; CHECK-NEXT: ret
entry:
%vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32> %a, <2 x i32> %v, i32 1)
ret <4 x i32> %vqrdmulh2.i
}

define <4 x i32> @test_vqrdmulhq_laneq_s32_intrinsic_lo(<4 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vqrdmulhq_laneq_s32_intrinsic_lo:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqrdmulh v0.4s, v0.4s, v1.s[1]
; CHECK-NEXT: ret
entry:
%vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> %a, <4 x i32> %v, i32 1)
ret <4 x i32> %vqrdmulh2.i
}

define <4 x i32> @test_vqrdmulhq_laneq_s32_intrinsic_hi(<4 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vqrdmulhq_laneq_s32_intrinsic_hi:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqrdmulh v0.4s, v0.4s, v1.s[3]
; CHECK-NEXT: ret
entry:
%vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> %a, <4 x i32> %v, i32 3)
ret <4 x i32> %vqrdmulh2.i
}

define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) {
; CHECK-LABEL: test_vmul_lane_f32:
; CHECK: // %bb.0: // %entry