llvm-project/clang/test/CodeGen/aarch64-neon-2velem.c


// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
// Test new aarch64 intrinsics and types
#include <arm_neon.h>
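// This file exercises the by-lane (v*_lane / v*_laneq) forms of the AArch64
// NEON multiply, multiply-accumulate, fused multiply-add and widening
// multiply intrinsics. The autogenerated CHECK lines verify that each
// intrinsic lowers to a shufflevector splat of the selected lane followed by
// ordinary vector arithmetic, an llvm.fma call, or an
// aarch64.neon.{s,u}mull intrinsic.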
// CHECK-LABEL: @test_vmla_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[ADD]]
//
int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
return vmla_lane_s16(a, b, v, 3);
}
// CHECK-LABEL: @test_vmlaq_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[ADD]]
//
int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
return vmlaq_lane_s16(a, b, v, 3);
}
// CHECK-LABEL: @test_vmla_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[ADD]]
//
int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
return vmla_lane_s32(a, b, v, 1);
}
// CHECK-LABEL: @test_vmlaq_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
return vmlaq_lane_s32(a, b, v, 1);
}
// CHECK-LABEL: @test_vmla_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[ADD]]
//
int16x4_t test_vmla_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
return vmla_laneq_s16(a, b, v, 7);
}
// CHECK-LABEL: @test_vmlaq_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[ADD]]
//
int16x8_t test_vmlaq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
return vmlaq_laneq_s16(a, b, v, 7);
}
// CHECK-LABEL: @test_vmla_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[ADD]]
//
int32x2_t test_vmla_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
return vmla_laneq_s32(a, b, v, 3);
}
// CHECK-LABEL: @test_vmlaq_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlaq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
return vmlaq_laneq_s32(a, b, v, 3);
}
// CHECK-LABEL: @test_vmls_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[SUB]]
//
int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
return vmls_lane_s16(a, b, v, 3);
}
// CHECK-LABEL: @test_vmlsq_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[SUB]]
//
int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
return vmlsq_lane_s16(a, b, v, 3);
}
// CHECK-LABEL: @test_vmls_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[SUB]]
//
int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
return vmls_lane_s32(a, b, v, 1);
}
// CHECK-LABEL: @test_vmlsq_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
return vmlsq_lane_s32(a, b, v, 1);
}
// CHECK-LABEL: @test_vmls_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[SUB]]
//
int16x4_t test_vmls_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
return vmls_laneq_s16(a, b, v, 7);
}
// CHECK-LABEL: @test_vmlsq_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[SUB]]
//
int16x8_t test_vmlsq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
return vmlsq_laneq_s16(a, b, v, 7);
}
// CHECK-LABEL: @test_vmls_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[SUB]]
//
int32x2_t test_vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
return vmls_laneq_s32(a, b, v, 3);
}
// CHECK-LABEL: @test_vmlsq_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
return vmlsq_laneq_s32(a, b, v, 3);
}
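// Plain multiply by lane: vmul_lane/vmul_laneq for signed and unsigned 16-
// and 32-bit elements. The lane is splatted with a shufflevector and the
// result is an ordinary vector mul.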
// CHECK-LABEL: @test_vmul_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i16> [[MUL]]
//
int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t v) {
return vmul_lane_s16(a, v, 3);
}
// CHECK-LABEL: @test_vmulq_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <8 x i16> [[MUL]]
//
int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t v) {
return vmulq_lane_s16(a, v, 3);
}
// CHECK-LABEL: @test_vmul_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x i32> [[MUL]]
//
int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t v) {
return vmul_lane_s32(a, v, 1);
}
// CHECK-LABEL: @test_vmulq_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i32> [[MUL]]
//
int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t v) {
return vmulq_lane_s32(a, v, 1);
}
// CHECK-LABEL: @test_vmul_lane_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i16> [[MUL]]
//
uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t v) {
return vmul_lane_u16(a, v, 3);
}
// CHECK-LABEL: @test_vmulq_lane_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <8 x i16> [[MUL]]
//
uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t v) {
return vmulq_lane_u16(a, v, 3);
}
// CHECK-LABEL: @test_vmul_lane_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x i32> [[MUL]]
//
uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t v) {
return vmul_lane_u32(a, v, 1);
}
// CHECK-LABEL: @test_vmulq_lane_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i32> [[MUL]]
//
uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t v) {
return vmulq_lane_u32(a, v, 1);
}
// CHECK-LABEL: @test_vmul_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i16> [[MUL]]
//
int16x4_t test_vmul_laneq_s16(int16x4_t a, int16x8_t v) {
return vmul_laneq_s16(a, v, 7);
}
// CHECK-LABEL: @test_vmulq_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <8 x i16> [[MUL]]
//
int16x8_t test_vmulq_laneq_s16(int16x8_t a, int16x8_t v) {
return vmulq_laneq_s16(a, v, 7);
}
// CHECK-LABEL: @test_vmul_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x i32> [[MUL]]
//
int32x2_t test_vmul_laneq_s32(int32x2_t a, int32x4_t v) {
return vmul_laneq_s32(a, v, 3);
}
// CHECK-LABEL: @test_vmulq_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i32> [[MUL]]
//
int32x4_t test_vmulq_laneq_s32(int32x4_t a, int32x4_t v) {
return vmulq_laneq_s32(a, v, 3);
}
// CHECK-LABEL: @test_vmul_laneq_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i16> [[MUL]]
//
uint16x4_t test_vmul_laneq_u16(uint16x4_t a, uint16x8_t v) {
return vmul_laneq_u16(a, v, 7);
}
// CHECK-LABEL: @test_vmulq_laneq_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <8 x i16> [[MUL]]
//
uint16x8_t test_vmulq_laneq_u16(uint16x8_t a, uint16x8_t v) {
return vmulq_laneq_u16(a, v, 7);
}
// CHECK-LABEL: @test_vmul_laneq_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x i32> [[MUL]]
//
uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) {
return vmul_laneq_u32(a, v, 3);
}
// CHECK-LABEL: @test_vmulq_laneq_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i32> [[MUL]]
//
uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t v) {
return vmulq_laneq_u32(a, v, 3);
}
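// Fused multiply-add/subtract by lane: vfma/vfms and their quad (vfmaq/vfmsq)
// and laneq forms. These lower to llvm.fma.* calls; the vfms variants negate
// the multiplicand with an fneg before the fma.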
// CHECK-LABEL: @test_vfma_lane_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
// CHECK-NEXT: ret <2 x float> [[FMLA2]]
//
float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
return vfma_lane_f32(a, b, v, 1);
}
// CHECK-LABEL: @test_vfmaq_lane_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
// CHECK-NEXT: ret <4 x float> [[FMLA2]]
//
float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
return vfmaq_lane_f32(a, b, v, 1);
}
// CHECK-LABEL: @test_vfma_laneq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
// CHECK-NEXT: ret <2 x float> [[TMP6]]
//
float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
return vfma_laneq_f32(a, b, v, 3);
}
// CHECK-LABEL: @test_vfmaq_laneq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
// CHECK-NEXT: ret <4 x float> [[TMP6]]
//
float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
return vfmaq_laneq_f32(a, b, v, 3);
}
// CHECK-LABEL: @test_vfms_lane_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]]
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
// CHECK-NEXT: ret <2 x float> [[FMLA2]]
//
float32x2_t test_vfms_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
return vfms_lane_f32(a, b, v, 1);
}
// CHECK-LABEL: @test_vfmsq_lane_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]]
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
// CHECK-NEXT: ret <4 x float> [[FMLA2]]
//
float32x4_t test_vfmsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
return vfmsq_lane_f32(a, b, v, 1);
}
// CHECK-LABEL: @test_vfms_laneq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]]
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
// CHECK-NEXT: ret <2 x float> [[TMP6]]
//
float32x2_t test_vfms_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
return vfms_laneq_f32(a, b, v, 3);
}
// CHECK-LABEL: @test_vfmsq_laneq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]]
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
// CHECK-NEXT: ret <4 x float> [[TMP6]]
//
float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
return vfmsq_laneq_f32(a, b, v, 3);
}
// CHECK-LABEL: @test_vfmaq_lane_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
// CHECK-NEXT: ret <2 x double> [[FMLA2]]
//
float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
return vfmaq_lane_f64(a, b, v, 0);
}
// CHECK-LABEL: @test_vfmaq_laneq_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
// CHECK-NEXT: ret <2 x double> [[TMP6]]
//
float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
return vfmaq_laneq_f64(a, b, v, 1);
}
// CHECK-LABEL: @test_vfmsq_lane_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x double> [[B:%.*]]
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[FNEG]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
// CHECK-NEXT: ret <2 x double> [[FMLA2]]
//
float64x2_t test_vfmsq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
return vfmsq_lane_f64(a, b, v, 0);
}
// CHECK-LABEL: @test_vfmsq_laneq_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x double> [[B:%.*]]
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[FNEG]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
// CHECK-NEXT: ret <2 x double> [[TMP6]]
//
float64x2_t test_vfmsq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
return vfmsq_laneq_f64(a, b, v, 1);
}
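// Scalar-by-lane FMA forms (vfmas/vfmss/vfmsd): the lane is read with an
// extractelement and the operation becomes a scalar llvm.fma.f32/f64 call,
// again with an fneg for the subtract variants.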
// CHECK-LABEL: @test_vfmas_laneq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <4 x float> [[V:%.*]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.fma.f32(float [[B:%.*]], float [[EXTRACT]], float [[A:%.*]])
// CHECK-NEXT: ret float [[TMP0]]
//
float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
return vfmas_laneq_f32(a, b, v, 3);
}
// CHECK-LABEL: @test_vfmsd_lane_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg double [[B:%.*]]
// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <1 x double> [[V:%.*]], i32 0
// CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.fma.f64(double [[FNEG]], double [[EXTRACT]], double [[A:%.*]])
// CHECK-NEXT: ret double [[TMP0]]
//
float64_t test_vfmsd_lane_f64(float64_t a, float64_t b, float64x1_t v) {
return vfmsd_lane_f64(a, b, v, 0);
}
// CHECK-LABEL: @test_vfmss_laneq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg float [[B:%.*]]
// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <4 x float> [[V:%.*]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.fma.f32(float [[FNEG]], float [[EXTRACT]], float [[A:%.*]])
// CHECK-NEXT: ret float [[TMP0]]
//
float32_t test_vfmss_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
return vfmss_laneq_f32(a, b, v, 3);
}
// CHECK-LABEL: @test_vfmsd_laneq_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg double [[B:%.*]]
// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[V:%.*]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.fma.f64(double [[FNEG]], double [[EXTRACT]], double [[A:%.*]])
// CHECK-NEXT: ret double [[TMP0]]
//
float64_t test_vfmsd_laneq_f64(float64_t a, float64_t b, float64x2_t v) {
return vfmsd_laneq_f64(a, b, v, 1);
}
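// Widening multiply-accumulate by lane, signed: vmlal/vmlsl lower to an
// aarch64.neon.smull intrinsic on the splatted lane followed by a vector add
// or sub. The _high_ variants first extract the upper half of the first
// multiplicand with a shufflevector.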
// CHECK-LABEL: @test_vmlal_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
return vmlal_lane_s16(a, b, v, 3);
}
// CHECK-LABEL: @test_vmlal_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
return vmlal_lane_s32(a, b, v, 1);
}
// CHECK-LABEL: @test_vmlal_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
return vmlal_laneq_s16(a, b, v, 7);
}
// CHECK-LABEL: @test_vmlal_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
return vmlal_laneq_s32(a, b, v, 3);
}
// CHECK-LABEL: @test_vmlal_high_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
return vmlal_high_lane_s16(a, b, v, 3);
}
// CHECK-LABEL: @test_vmlal_high_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
return vmlal_high_lane_s32(a, b, v, 1);
}
// CHECK-LABEL: @test_vmlal_high_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
return vmlal_high_laneq_s16(a, b, v, 7);
}
// CHECK-LABEL: @test_vmlal_high_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
return vmlal_high_laneq_s32(a, b, v, 3);
}
// CHECK-LABEL: @test_vmlsl_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
return vmlsl_lane_s16(a, b, v, 3);
}
// CHECK-LABEL: @test_vmlsl_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
return vmlsl_lane_s32(a, b, v, 1);
}
// CHECK-LABEL: @test_vmlsl_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
return vmlsl_laneq_s16(a, b, v, 7);
}
// CHECK-LABEL: @test_vmlsl_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
return vmlsl_laneq_s32(a, b, v, 3);
}
// CHECK-LABEL: @test_vmlsl_high_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
return vmlsl_high_lane_s16(a, b, v, 3);
}
// CHECK-LABEL: @test_vmlsl_high_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
return vmlsl_high_lane_s32(a, b, v, 1);
}
// CHECK-LABEL: @test_vmlsl_high_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
return vmlsl_high_laneq_s16(a, b, v, 7);
}
// CHECK-LABEL: @test_vmlsl_high_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
return vmlsl_high_laneq_s32(a, b, v, 3);
}
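// Unsigned widening multiply-accumulate by lane: same structure as the signed
// tests above, but the multiply is emitted as aarch64.neon.umull.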
// CHECK-LABEL: @test_vmlal_lane_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
return vmlal_lane_u16(a, b, v, 3);
}
// CHECK-LABEL: @test_vmlal_lane_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
return vmlal_lane_u32(a, b, v, 1);
}
// CHECK-LABEL: @test_vmlal_laneq_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
return vmlal_laneq_u16(a, b, v, 7);
}
// CHECK-LABEL: @test_vmlal_laneq_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
return vmlal_laneq_u32(a, b, v, 3);
}
// CHECK-LABEL: @test_vmlal_high_lane_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
return vmlal_high_lane_u16(a, b, v, 3);
}
// CHECK-LABEL: @test_vmlal_high_lane_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
return vmlal_high_lane_u32(a, b, v, 1);
}
// CHECK-LABEL: @test_vmlal_high_laneq_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
return vmlal_high_laneq_u16(a, b, v, 7);
}
// CHECK-LABEL: @test_vmlal_high_laneq_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
return vmlal_high_laneq_u32(a, b, v, 3);
}
// CHECK-LABEL: @test_vmlsl_lane_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
return vmlsl_lane_u16(a, b, v, 3);
}
// CHECK-LABEL: @test_vmlsl_lane_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
return vmlsl_lane_u32(a, b, v, 1);
}
// CHECK-LABEL: @test_vmlsl_laneq_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
return vmlsl_laneq_u16(a, b, v, 7);
}
// CHECK-LABEL: @test_vmlsl_laneq_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
return vmlsl_laneq_u32(a, b, v, 3);
}
// CHECK-LABEL: @test_vmlsl_high_lane_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
return vmlsl_high_lane_u16(a, b, v, 3);
}
// CHECK-LABEL: @test_vmlsl_high_lane_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
return vmlsl_high_lane_u32(a, b, v, 1);
}
// CHECK-LABEL: @test_vmlsl_high_laneq_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
return vmlsl_high_laneq_u16(a, b, v, 7);
}
// CHECK-LABEL: @test_vmlsl_high_laneq_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
return vmlsl_high_laneq_u32(a, b, v, 3);
}
// CHECK-LABEL: @test_vmull_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) {
return vmull_lane_s16(a, v, 3);
}
// CHECK-LABEL: @test_vmull_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) {
return vmull_lane_s32(a, v, 1);
}
// CHECK-LABEL: @test_vmull_lane_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) {
return vmull_lane_u16(a, v, 3);
}
// CHECK-LABEL: @test_vmull_lane_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) {
return vmull_lane_u32(a, v, 1);
}
// CHECK-LABEL: @test_vmull_high_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) {
return vmull_high_lane_s16(a, v, 3);
}
// CHECK-LABEL: @test_vmull_high_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) {
return vmull_high_lane_s32(a, v, 1);
}
// CHECK-LABEL: @test_vmull_high_lane_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) {
return vmull_high_lane_u16(a, v, 3);
}
// CHECK-LABEL: @test_vmull_high_lane_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) {
return vmull_high_lane_u32(a, v, 1);
}
// CHECK-LABEL: @test_vmull_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) {
return vmull_laneq_s16(a, v, 7);
}
// CHECK-LABEL: @test_vmull_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) {
return vmull_laneq_s32(a, v, 3);
}
// CHECK-LABEL: @test_vmull_laneq_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) {
return vmull_laneq_u16(a, v, 7);
}
// CHECK-LABEL: @test_vmull_laneq_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) {
return vmull_laneq_u32(a, v, 3);
}
// CHECK-LABEL: @test_vmull_high_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
return vmull_high_laneq_s16(a, v, 7);
}
// CHECK-LABEL: @test_vmull_high_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
return vmull_high_laneq_s32(a, v, 3);
}
// CHECK-LABEL: @test_vmull_high_laneq_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) {
return vmull_high_laneq_u16(a, v, 7);
}
// CHECK-LABEL: @test_vmull_high_laneq_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) {
return vmull_high_laneq_u32(a, v, 3);
}
// CHECK-LABEL: @test_vqdmlal_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
return vqdmlal_lane_s16(a, b, v, 3);
}
// CHECK-LABEL: @test_vqdmlal_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
return vqdmlal_lane_s32(a, b, v, 1);
}
// CHECK-LABEL: @test_vqdmlal_high_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
return vqdmlal_high_lane_s16(a, b, v, 3);
}
// CHECK-LABEL: @test_vqdmlal_high_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
return vqdmlal_high_lane_s32(a, b, v, 1);
}
// CHECK-LABEL: @test_vqdmlsl_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
return vqdmlsl_lane_s16(a, b, v, 3);
}
// CHECK-LABEL: @test_vqdmlsl_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
return vqdmlsl_lane_s32(a, b, v, 1);
}
// CHECK-LABEL: @test_vqdmlsl_high_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
return vqdmlsl_high_lane_s16(a, b, v, 3);
}
// CHECK-LABEL: @test_vqdmlsl_high_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
return vqdmlsl_high_lane_s32(a, b, v, 1);
}
// CHECK-LABEL: @test_vqdmull_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) {
return vqdmull_lane_s16(a, v, 3);
}
// CHECK-LABEL: @test_vqdmull_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) {
return vqdmull_lane_s32(a, v, 1);
}
// CHECK-LABEL: @test_vqdmull_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) {
return vqdmull_laneq_s16(a, v, 3);
}
// CHECK-LABEL: @test_vqdmull_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) {
return vqdmull_laneq_s32(a, v, 3);
}
// CHECK-LABEL: @test_vqdmull_high_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) {
return vqdmull_high_lane_s16(a, v, 3);
}
// CHECK-LABEL: @test_vqdmull_high_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) {
return vqdmull_high_lane_s32(a, v, 1);
}
// CHECK-LABEL: @test_vqdmull_high_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
return vqdmull_high_laneq_s16(a, v, 7);
}
// CHECK-LABEL: @test_vqdmull_high_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
int64x2_t test_vqdmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
return vqdmull_high_laneq_s32(a, v, 3);
}
// CHECK-LABEL: @test_vqdmulh_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> [[VQDMULH_LANE_V]], <4 x i16> [[VQDMULH_LANE_V1]], i32 3)
// CHECK-NEXT: ret <4 x i16> [[VQDMULH_LANE_V2]]
//
int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) {
return vqdmulh_lane_s16(a, v, 3);
}
// CHECK-LABEL: @test_vqdmulhq_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16> [[VQDMULHQ_LANE_V]], <4 x i16> [[VQDMULHQ_LANE_V1]], i32 3)
// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_LANE_V2]]
//
int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
return vqdmulhq_lane_s16(a, v, 3);
}
// CHECK-LABEL: @test_vqdmulh_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32> [[VQDMULH_LANE_V]], <2 x i32> [[VQDMULH_LANE_V1]], i32 1)
// CHECK-NEXT: ret <2 x i32> [[VQDMULH_LANE_V2]]
//
int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) {
return vqdmulh_lane_s32(a, v, 1);
}
// CHECK-LABEL: @test_vqdmulhq_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32> [[VQDMULHQ_LANE_V]], <2 x i32> [[VQDMULHQ_LANE_V1]], i32 1)
// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_LANE_V2]]
//
int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
return vqdmulhq_lane_s32(a, v, 1);
}
// CHECK-LABEL: @test_vqrdmulh_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16> [[VQRDMULH_LANE_V]], <4 x i16> [[VQRDMULH_LANE_V1]], i32 3)
// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_LANE_V2]]
//
int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) {
return vqrdmulh_lane_s16(a, v, 3);
}
// CHECK-LABEL: @test_vqrdmulhq_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16> [[VQRDMULHQ_LANE_V]], <4 x i16> [[VQRDMULHQ_LANE_V1]], i32 3)
// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_LANE_V2]]
//
int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
return vqrdmulhq_lane_s16(a, v, 3);
}
// CHECK-LABEL: @test_vqrdmulh_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32> [[VQRDMULH_LANE_V]], <2 x i32> [[VQRDMULH_LANE_V1]], i32 1)
// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_LANE_V2]]
//
int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) {
return vqrdmulh_lane_s32(a, v, 1);
}
// CHECK-LABEL: @test_vqrdmulhq_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32> [[VQRDMULHQ_LANE_V]], <2 x i32> [[VQRDMULHQ_LANE_V1]], i32 1)
// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_LANE_V2]]
//
int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
return vqrdmulhq_lane_s32(a, v, 1);
}
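// The lane forms of vqdmulh/vqrdmulh exercised above take the multiplier from a
// single element of a vector register. A hedged usage sketch follows (not one of
// the autogenerated tests, and kept static inline so no IR is emitted for the
// CHECK lines): several small constants are packed into one 128-bit vector and
// selected by lane, trading per-constant materialisation for a single vector
// constant. The helper name and the constant values are illustrative only.
static inline int16x4_t sketch_scale_by_packed_constants(int16x4_t a, int16x4_t b) {
  const int16x8_t k = {2, 3, 4, 5, 6, 7, 8, 9};
  int16x4_t sa = vqdmulh_laneq_s16(a, k, 0); // saturating doubling multiply-high by k[0]
  int16x4_t sb = vqdmulh_laneq_s16(b, k, 1); // same, using k[1]
  return vqadd_s16(sa, sb);                  // saturating add of the two scaled vectors
}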
// CHECK-LABEL: @test_vmul_lane_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x float> [[MUL]]
//
float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) {
return vmul_lane_f32(a, v, 1);
}
// CHECK-LABEL: @test_vmul_lane_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
// CHECK-NEXT: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
// CHECK-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
// CHECK-NEXT: ret <1 x double> [[TMP5]]
//
float64x1_t test_vmul_lane_f64(float64x1_t a, float64x1_t v) {
return vmul_lane_f64(a, v, 0);
}
// CHECK-LABEL: @test_vmulq_lane_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x float> [[MUL]]
//
float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) {
return vmulq_lane_f32(a, v, 1);
}
// CHECK-LABEL: @test_vmulq_lane_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x double> [[MUL]]
//
float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) {
return vmulq_lane_f64(a, v, 0);
}
// CHECK-LABEL: @test_vmul_laneq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x float> [[MUL]]
//
float32x2_t test_vmul_laneq_f32(float32x2_t a, float32x4_t v) {
return vmul_laneq_f32(a, v, 3);
}
// CHECK-LABEL: @test_vmul_laneq_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
// CHECK-NEXT: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
// CHECK-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
// CHECK-NEXT: ret <1 x double> [[TMP5]]
//
float64x1_t test_vmul_laneq_f64(float64x1_t a, float64x2_t v) {
return vmul_laneq_f64(a, v, 1);
}
// CHECK-LABEL: @test_vmulq_laneq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x float> [[MUL]]
//
float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) {
return vmulq_laneq_f32(a, v, 3);
}
// CHECK-LABEL: @test_vmulq_laneq_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x double> [[MUL]]
//
float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) {
return vmulq_laneq_f64(a, v, 1);
}
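// vmulx by lane keeps the same lane-splat pattern but lowers to the
// llvm.aarch64.neon.fmulx intrinsic rather than a plain fmul.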
// CHECK-LABEL: @test_vmulx_lane_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #4
// CHECK-NEXT: ret <2 x float> [[VMULX2_I]]
//
float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) {
return vmulx_lane_f32(a, v, 1);
}
// CHECK-LABEL: @test_vmulxq_lane_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #4
// CHECK-NEXT: ret <4 x float> [[VMULX2_I]]
//
float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) {
return vmulxq_lane_f32(a, v, 1);
}
// CHECK-LABEL: @test_vmulxq_lane_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #4
// CHECK-NEXT: ret <2 x double> [[VMULX2_I]]
//
float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) {
return vmulxq_lane_f64(a, v, 0);
}
// CHECK-LABEL: @test_vmulx_laneq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #4
// CHECK-NEXT: ret <2 x float> [[VMULX2_I]]
//
float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) {
return vmulx_laneq_f32(a, v, 3);
}
// CHECK-LABEL: @test_vmulxq_laneq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #4
// CHECK-NEXT: ret <4 x float> [[VMULX2_I]]
//
float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) {
return vmulxq_laneq_f32(a, v, 3);
}
// CHECK-LABEL: @test_vmulxq_laneq_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #4
// CHECK-NEXT: ret <2 x double> [[VMULX2_I]]
//
float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) {
return vmulxq_laneq_f64(a, v, 1);
}
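// The *_0 tests repeat the lane intrinsics with lane index 0, so the splat is
// emitted as a shufflevector with a zeroinitializer mask.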
// CHECK-LABEL: @test_vmla_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[ADD]]
//
int16x4_t test_vmla_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
return vmla_lane_s16(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlaq_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[ADD]]
//
int16x8_t test_vmlaq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
return vmlaq_lane_s16(a, b, v, 0);
}
// CHECK-LABEL: @test_vmla_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[ADD]]
//
int32x2_t test_vmla_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
return vmla_lane_s32(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlaq_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlaq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
return vmlaq_lane_s32(a, b, v, 0);
}
// CHECK-LABEL: @test_vmla_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[ADD]]
//
int16x4_t test_vmla_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
return vmla_laneq_s16(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlaq_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[ADD]]
//
int16x8_t test_vmlaq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
return vmlaq_laneq_s16(a, b, v, 0);
}
// CHECK-LABEL: @test_vmla_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[ADD]]
//
int32x2_t test_vmla_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
return vmla_laneq_s32(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlaq_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlaq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
return vmlaq_laneq_s32(a, b, v, 0);
}
// CHECK-LABEL: @test_vmls_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[SUB]]
//
int16x4_t test_vmls_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
return vmls_lane_s16(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlsq_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[SUB]]
//
int16x8_t test_vmlsq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
return vmlsq_lane_s16(a, b, v, 0);
}
// CHECK-LABEL: @test_vmls_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[SUB]]
//
int32x2_t test_vmls_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
return vmls_lane_s32(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlsq_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
return vmlsq_lane_s32(a, b, v, 0);
}
// CHECK-LABEL: @test_vmls_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[SUB]]
//
int16x4_t test_vmls_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
return vmls_laneq_s16(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlsq_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[SUB]]
//
int16x8_t test_vmlsq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
return vmlsq_laneq_s16(a, b, v, 0);
}
// CHECK-LABEL: @test_vmls_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[SUB]]
//
int32x2_t test_vmls_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
return vmls_laneq_s32(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlsq_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
return vmlsq_laneq_s32(a, b, v, 0);
}
// CHECK-LABEL: @test_vmul_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i16> [[MUL]]
//
int16x4_t test_vmul_lane_s16_0(int16x4_t a, int16x4_t v) {
return vmul_lane_s16(a, v, 0);
}
// CHECK-LABEL: @test_vmulq_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <8 x i16> [[MUL]]
//
int16x8_t test_vmulq_lane_s16_0(int16x8_t a, int16x4_t v) {
return vmulq_lane_s16(a, v, 0);
}
// CHECK-LABEL: @test_vmul_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x i32> [[MUL]]
//
int32x2_t test_vmul_lane_s32_0(int32x2_t a, int32x2_t v) {
return vmul_lane_s32(a, v, 0);
}
// CHECK-LABEL: @test_vmulq_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i32> [[MUL]]
//
int32x4_t test_vmulq_lane_s32_0(int32x4_t a, int32x2_t v) {
return vmulq_lane_s32(a, v, 0);
}
// CHECK-LABEL: @test_vmul_lane_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i16> [[MUL]]
//
uint16x4_t test_vmul_lane_u16_0(uint16x4_t a, uint16x4_t v) {
return vmul_lane_u16(a, v, 0);
}
// CHECK-LABEL: @test_vmulq_lane_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <8 x i16> [[MUL]]
//
uint16x8_t test_vmulq_lane_u16_0(uint16x8_t a, uint16x4_t v) {
return vmulq_lane_u16(a, v, 0);
}
// CHECK-LABEL: @test_vmul_lane_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x i32> [[MUL]]
//
uint32x2_t test_vmul_lane_u32_0(uint32x2_t a, uint32x2_t v) {
return vmul_lane_u32(a, v, 0);
}
// CHECK-LABEL: @test_vmulq_lane_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i32> [[MUL]]
//
uint32x4_t test_vmulq_lane_u32_0(uint32x4_t a, uint32x2_t v) {
return vmulq_lane_u32(a, v, 0);
}
// CHECK-LABEL: @test_vmul_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i16> [[MUL]]
//
int16x4_t test_vmul_laneq_s16_0(int16x4_t a, int16x8_t v) {
return vmul_laneq_s16(a, v, 0);
}
// CHECK-LABEL: @test_vmulq_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <8 x i16> [[MUL]]
//
int16x8_t test_vmulq_laneq_s16_0(int16x8_t a, int16x8_t v) {
return vmulq_laneq_s16(a, v, 0);
}
// CHECK-LABEL: @test_vmul_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x i32> [[MUL]]
//
int32x2_t test_vmul_laneq_s32_0(int32x2_t a, int32x4_t v) {
return vmul_laneq_s32(a, v, 0);
}
// CHECK-LABEL: @test_vmulq_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i32> [[MUL]]
//
int32x4_t test_vmulq_laneq_s32_0(int32x4_t a, int32x4_t v) {
return vmulq_laneq_s32(a, v, 0);
}
// CHECK-LABEL: @test_vmul_laneq_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i16> [[MUL]]
//
uint16x4_t test_vmul_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
return vmul_laneq_u16(a, v, 0);
}
// CHECK-LABEL: @test_vmulq_laneq_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <8 x i16> [[MUL]]
//
uint16x8_t test_vmulq_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
return vmulq_laneq_u16(a, v, 0);
}
// CHECK-LABEL: @test_vmul_laneq_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x i32> [[MUL]]
//
uint32x2_t test_vmul_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
return vmul_laneq_u32(a, v, 0);
}
// CHECK-LABEL: @test_vmulq_laneq_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i32> [[MUL]]
//
uint32x4_t test_vmulq_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
return vmulq_laneq_u32(a, v, 0);
}
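// Lane forms of the fused multiply-add intrinsics lower to llvm.fma on the
// splatted lane; the vfms variants negate the multiplicand with fneg before
// the call.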
// CHECK-LABEL: @test_vfma_lane_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
// CHECK-NEXT: ret <2 x float> [[FMLA2]]
//
float32x2_t test_vfma_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
return vfma_lane_f32(a, b, v, 0);
}
// CHECK-LABEL: @test_vfmaq_lane_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
// CHECK-NEXT: ret <4 x float> [[FMLA2]]
//
float32x4_t test_vfmaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
return vfmaq_lane_f32(a, b, v, 0);
}
// CHECK-LABEL: @test_vfma_laneq_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
// CHECK-NEXT: ret <2 x float> [[TMP6]]
//
float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
return vfma_laneq_f32(a, b, v, 0);
}
// CHECK-LABEL: @test_vfmaq_laneq_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
// CHECK-NEXT: ret <4 x float> [[TMP6]]
//
float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
return vfmaq_laneq_f32(a, b, v, 0);
}
// CHECK-LABEL: @test_vfms_lane_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]]
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
// CHECK-NEXT: ret <2 x float> [[FMLA2]]
//
float32x2_t test_vfms_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
return vfms_lane_f32(a, b, v, 0);
}
// CHECK-LABEL: @test_vfmsq_lane_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]]
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
// CHECK-NEXT: ret <4 x float> [[FMLA2]]
//
float32x4_t test_vfmsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
return vfmsq_lane_f32(a, b, v, 0);
}
// CHECK-LABEL: @test_vfms_laneq_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]]
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
// CHECK-NEXT: ret <2 x float> [[TMP6]]
//
float32x2_t test_vfms_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
return vfms_laneq_f32(a, b, v, 0);
}
// CHECK-LABEL: @test_vfmsq_laneq_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]]
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
// CHECK-NEXT: ret <4 x float> [[TMP6]]
//
float32x4_t test_vfmsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
return vfmsq_laneq_f32(a, b, v, 0);
}
// CHECK-LABEL: @test_vfmaq_laneq_f64_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
// CHECK-NEXT: ret <2 x double> [[TMP6]]
//
float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
return vfmaq_laneq_f64(a, b, v, 0);
}
// CHECK-LABEL: @test_vfmsq_laneq_f64_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x double> [[B:%.*]]
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[FNEG]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
// CHECK-NEXT: ret <2 x double> [[TMP6]]
//
float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
return vfmsq_laneq_f64(a, b, v, 0);
}
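// Widening multiply-accumulate by lane: the lane splat feeds
// llvm.aarch64.neon.smull, and the result is added (vmlal*) or subtracted
// (vmlsl*) from the accumulator; the *_high_* forms first take the upper half
// of b with a shufflevector.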
// CHECK-LABEL: @test_vmlal_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
return vmlal_lane_s16(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlal_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
return vmlal_lane_s32(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlal_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
return vmlal_laneq_s16(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlal_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
return vmlal_laneq_s32(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlal_high_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
return vmlal_high_lane_s16(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlal_high_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
return vmlal_high_lane_s32(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlal_high_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
return vmlal_high_laneq_s16(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlal_high_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
return vmlal_high_laneq_s32(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlsl_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
return vmlsl_lane_s16(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlsl_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
return vmlsl_lane_s32(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlsl_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
return vmlsl_laneq_s16(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlsl_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
return vmlsl_laneq_s32(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlsl_high_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
return vmlsl_high_lane_s16(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlsl_high_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
return vmlsl_high_lane_s32(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlsl_high_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
return vmlsl_high_laneq_s16(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlsl_high_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
return vmlsl_high_laneq_s32(a, b, v, 0);
}
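// The unsigned variants follow the same pattern but multiply with
// llvm.aarch64.neon.umull.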
// CHECK-LABEL: @test_vmlal_lane_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
return vmlal_lane_u16(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlal_lane_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
return vmlal_lane_u32(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlal_laneq_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
return vmlal_laneq_u16(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlal_laneq_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
return vmlal_laneq_u32(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlal_high_lane_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
return vmlal_high_lane_u16(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlal_high_lane_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
return vmlal_high_lane_u32(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlal_high_laneq_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
return vmlal_high_laneq_u16(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlal_high_laneq_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
return vmlal_high_laneq_u32(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlsl_lane_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
return vmlsl_lane_u16(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlsl_lane_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
return vmlsl_lane_u32(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlsl_laneq_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
return vmlsl_laneq_u16(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlsl_laneq_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
return vmlsl_laneq_u32(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlsl_high_lane_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
return vmlsl_high_lane_u16(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlsl_high_lane_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
return vmlsl_high_lane_u32(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlsl_high_laneq_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
return vmlsl_high_laneq_u16(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlsl_high_laneq_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
return vmlsl_high_laneq_u32(a, b, v, 0);
}
// CHECK-LABEL: @test_vmull_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) {
return vmull_lane_s16(a, v, 0);
}
// CHECK-LABEL: @test_vmull_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) {
return vmull_lane_s32(a, v, 0);
}
// CHECK-LABEL: @test_vmull_lane_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) {
return vmull_lane_u16(a, v, 0);
}
// CHECK-LABEL: @test_vmull_lane_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) {
return vmull_lane_u32(a, v, 0);
}
// CHECK-LABEL: @test_vmull_high_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
return vmull_high_lane_s16(a, v, 0);
}
// CHECK-LABEL: @test_vmull_high_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
return vmull_high_lane_s32(a, v, 0);
}
// CHECK-LABEL: @test_vmull_high_lane_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) {
return vmull_high_lane_u16(a, v, 0);
}
// CHECK-LABEL: @test_vmull_high_lane_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) {
return vmull_high_lane_u32(a, v, 0);
}
// CHECK-LABEL: @test_vmull_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
return vmull_laneq_s16(a, v, 0);
}
// CHECK-LABEL: @test_vmull_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
return vmull_laneq_s32(a, v, 0);
}
// CHECK-LABEL: @test_vmull_laneq_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
return vmull_laneq_u16(a, v, 0);
}
// CHECK-LABEL: @test_vmull_laneq_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
return vmull_laneq_u32(a, v, 0);
}
// CHECK-LABEL: @test_vmull_high_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
return vmull_high_laneq_s16(a, v, 0);
}
// CHECK-LABEL: @test_vmull_high_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
return vmull_high_laneq_s32(a, v, 0);
}
// CHECK-LABEL: @test_vmull_high_laneq_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
return vmull_high_laneq_u16(a, v, 0);
}
// CHECK-LABEL: @test_vmull_high_laneq_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
return vmull_high_laneq_u32(a, v, 0);
}
// CHECK-LABEL: @test_vqdmlal_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
return vqdmlal_lane_s16(a, b, v, 0);
}
// CHECK-LABEL: @test_vqdmlal_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
return vqdmlal_lane_s32(a, b, v, 0);
}
// CHECK-LABEL: @test_vqdmlal_high_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
return vqdmlal_high_lane_s16(a, b, v, 0);
}
// CHECK-LABEL: @test_vqdmlal_high_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
return vqdmlal_high_lane_s32(a, b, v, 0);
}
// CHECK-LABEL: @test_vqdmlsl_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
return vqdmlsl_lane_s16(a, b, v, 0);
}
// CHECK-LABEL: @test_vqdmlsl_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
return vqdmlsl_lane_s32(a, b, v, 0);
}
// CHECK-LABEL: @test_vqdmlsl_high_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
return vqdmlsl_high_lane_s16(a, b, v, 0);
}
// CHECK-LABEL: @test_vqdmlsl_high_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
return vqdmlsl_high_lane_s32(a, b, v, 0);
}
// CHECK-LABEL: @test_vqdmull_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
int32x4_t test_vqdmull_lane_s16_0(int16x4_t a, int16x4_t v) {
return vqdmull_lane_s16(a, v, 0);
}
// CHECK-LABEL: @test_vqdmull_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
int64x2_t test_vqdmull_lane_s32_0(int32x2_t a, int32x2_t v) {
return vqdmull_lane_s32(a, v, 0);
}
// CHECK-LABEL: @test_vqdmull_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
int32x4_t test_vqdmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
return vqdmull_laneq_s16(a, v, 0);
}
// CHECK-LABEL: @test_vqdmull_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
int64x2_t test_vqdmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
return vqdmull_laneq_s32(a, v, 0);
}
// CHECK-LABEL: @test_vqdmull_high_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
int32x4_t test_vqdmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
return vqdmull_high_lane_s16(a, v, 0);
}
// CHECK-LABEL: @test_vqdmull_high_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
int64x2_t test_vqdmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
return vqdmull_high_lane_s32(a, v, 0);
}
// CHECK-LABEL: @test_vqdmull_high_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
int32x4_t test_vqdmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
return vqdmull_high_laneq_s16(a, v, 0);
}
// CHECK-LABEL: @test_vqdmull_high_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
int64x2_t test_vqdmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
return vqdmull_high_laneq_s32(a, v, 0);
}
// CHECK-LABEL: @test_vqdmulh_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> [[VQDMULH_LANE_V]], <4 x i16> [[VQDMULH_LANE_V1]], i32 0)
// CHECK-NEXT: ret <4 x i16> [[VQDMULH_LANE_V2]]
//
int16x4_t test_vqdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
return vqdmulh_lane_s16(a, v, 0);
}
// CHECK-LABEL: @test_vqdmulhq_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16> [[VQDMULHQ_LANE_V]], <4 x i16> [[VQDMULHQ_LANE_V1]], i32 0)
// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_LANE_V2]]
//
int16x8_t test_vqdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
return vqdmulhq_lane_s16(a, v, 0);
}
// CHECK-LABEL: @test_vqdmulh_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32> [[VQDMULH_LANE_V]], <2 x i32> [[VQDMULH_LANE_V1]], i32 0)
// CHECK-NEXT: ret <2 x i32> [[VQDMULH_LANE_V2]]
//
int32x2_t test_vqdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
return vqdmulh_lane_s32(a, v, 0);
}
// CHECK-LABEL: @test_vqdmulhq_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32> [[VQDMULHQ_LANE_V]], <2 x i32> [[VQDMULHQ_LANE_V1]], i32 0)
// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_LANE_V2]]
//
int32x4_t test_vqdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
return vqdmulhq_lane_s32(a, v, 0);
}
// CHECK-LABEL: @test_vqrdmulh_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16> [[VQRDMULH_LANE_V]], <4 x i16> [[VQRDMULH_LANE_V1]], i32 0)
// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_LANE_V2]]
//
int16x4_t test_vqrdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
return vqrdmulh_lane_s16(a, v, 0);
}
// CHECK-LABEL: @test_vqrdmulhq_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16> [[VQRDMULHQ_LANE_V]], <4 x i16> [[VQRDMULHQ_LANE_V1]], i32 0)
// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_LANE_V2]]
//
int16x8_t test_vqrdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
return vqrdmulhq_lane_s16(a, v, 0);
}
// CHECK-LABEL: @test_vqrdmulh_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32> [[VQRDMULH_LANE_V]], <2 x i32> [[VQRDMULH_LANE_V1]], i32 0)
// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_LANE_V2]]
//
int32x2_t test_vqrdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
return vqrdmulh_lane_s32(a, v, 0);
}
// CHECK-LABEL: @test_vqrdmulhq_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32> [[VQRDMULHQ_LANE_V]], <2 x i32> [[VQRDMULHQ_LANE_V1]], i32 0)
// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_LANE_V2]]
//
int32x4_t test_vqrdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
return vqrdmulhq_lane_s32(a, v, 0);
}
// CHECK-LABEL: @test_vmul_lane_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x float> [[MUL]]
//
float32x2_t test_vmul_lane_f32_0(float32x2_t a, float32x2_t v) {
return vmul_lane_f32(a, v, 0);
}
// CHECK-LABEL: @test_vmulq_lane_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x float> [[MUL]]
//
float32x4_t test_vmulq_lane_f32_0(float32x4_t a, float32x2_t v) {
return vmulq_lane_f32(a, v, 0);
}
// CHECK-LABEL: @test_vmul_laneq_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x float> [[MUL]]
//
float32x2_t test_vmul_laneq_f32_0(float32x2_t a, float32x4_t v) {
return vmul_laneq_f32(a, v, 0);
}
// CHECK-LABEL: @test_vmul_laneq_f64_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
// CHECK-NEXT: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
// CHECK-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
// CHECK-NEXT: ret <1 x double> [[TMP5]]
//
float64x1_t test_vmul_laneq_f64_0(float64x1_t a, float64x2_t v) {
return vmul_laneq_f64(a, v, 0);
}
// CHECK-LABEL: @test_vmulq_laneq_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x float> [[MUL]]
//
float32x4_t test_vmulq_laneq_f32_0(float32x4_t a, float32x4_t v) {
return vmulq_laneq_f32(a, v, 0);
}
// CHECK-LABEL: @test_vmulq_laneq_f64_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x double> [[MUL]]
//
float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) {
return vmulq_laneq_f64(a, v, 0);
}
// CHECK-LABEL: @test_vmulx_lane_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #4
// CHECK-NEXT: ret <2 x float> [[VMULX2_I]]
//
float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) {
return vmulx_lane_f32(a, v, 0);
}
// CHECK-LABEL: @test_vmulxq_lane_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #4
// CHECK-NEXT: ret <4 x float> [[VMULX2_I]]
//
float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) {
return vmulxq_lane_f32(a, v, 0);
}
// CHECK-LABEL: @test_vmulxq_lane_f64_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #4
// CHECK-NEXT: ret <2 x double> [[VMULX2_I]]
//
float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) {
return vmulxq_lane_f64(a, v, 0);
}
// CHECK-LABEL: @test_vmulx_laneq_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #4
// CHECK-NEXT: ret <2 x float> [[VMULX2_I]]
//
float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) {
return vmulx_laneq_f32(a, v, 0);
}
// CHECK-LABEL: @test_vmulxq_laneq_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #4
// CHECK-NEXT: ret <4 x float> [[VMULX2_I]]
//
float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) {
return vmulxq_laneq_f32(a, v, 0);
}
// CHECK-LABEL: @test_vmulxq_laneq_f64_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #4
// CHECK-NEXT: ret <2 x double> [[VMULX2_I]]
//
float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) {
return vmulxq_laneq_f64(a, v, 0);
}
// CHECK-LABEL: @test_vmull_high_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I_I]]
//
int32x4_t test_vmull_high_n_s16(int16x8_t a, int16_t b) {
return vmull_high_n_s16(a, b);
}
// CHECK-LABEL: @test_vmull_high_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I_I]]
//
int64x2_t test_vmull_high_n_s32(int32x4_t a, int32_t b) {
return vmull_high_n_s32(a, b);
}
// CHECK-LABEL: @test_vmull_high_n_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I_I]]
//
uint32x4_t test_vmull_high_n_u16(uint16x8_t a, uint16_t b) {
return vmull_high_n_u16(a, b);
}
// CHECK-LABEL: @test_vmull_high_n_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I_I]]
//
uint64x2_t test_vmull_high_n_u32(uint32x4_t a, uint32_t b) {
return vmull_high_n_u32(a, b);
}
// CHECK-LABEL: @test_vqdmull_high_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4
// CHECK-NEXT: [[VQDMULL_V3_I_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I_I_I]]
//
int32x4_t test_vqdmull_high_n_s16(int16x8_t a, int16_t b) {
return vqdmull_high_n_s16(a, b);
}
// CHECK-LABEL: @test_vqdmull_high_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4
// CHECK-NEXT: [[VQDMULL_V3_I_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I_I_I]]
//
int64x2_t test_vqdmull_high_n_s32(int32x4_t a, int32_t b) {
return vqdmull_high_n_s32(a, b);
}
// CHECK-LABEL: @test_vmlal_high_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4
// CHECK-NEXT: [[ADD_I_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I_I]]
//
int32x4_t test_vmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
return vmlal_high_n_s16(a, b, c);
}
// CHECK-LABEL: @test_vmlal_high_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4
// CHECK-NEXT: [[ADD_I_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD_I_I]]
//
int64x2_t test_vmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
return vmlal_high_n_s32(a, b, c);
}
// CHECK-LABEL: @test_vmlal_high_n_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4
// CHECK-NEXT: [[ADD_I_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I_I]]
//
uint32x4_t test_vmlal_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
return vmlal_high_n_u16(a, b, c);
}
// CHECK-LABEL: @test_vmlal_high_n_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4
// CHECK-NEXT: [[ADD_I_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD_I_I]]
//
uint64x2_t test_vmlal_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
return vmlal_high_n_u32(a, b, c);
}
// CHECK-LABEL: @test_vqdmlal_high_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4
// CHECK-NEXT: [[VQDMLAL_V3_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I_I]]) #4
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I_I_I]]
//
int32x4_t test_vqdmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
return vqdmlal_high_n_s16(a, b, c);
}
// CHECK-LABEL: @test_vqdmlal_high_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4
// CHECK-NEXT: [[VQDMLAL_V3_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I_I]]) #4
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I_I_I]]
//
int64x2_t test_vqdmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
return vqdmlal_high_n_s32(a, b, c);
}
// CHECK-LABEL: @test_vmlsl_high_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4
// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB_I_I]]
//
int32x4_t test_vmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
return vmlsl_high_n_s16(a, b, c);
}
// CHECK-LABEL: @test_vmlsl_high_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4
// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB_I_I]]
//
int64x2_t test_vmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
return vmlsl_high_n_s32(a, b, c);
}
// CHECK-LABEL: @test_vmlsl_high_n_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4
// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB_I_I]]
//
uint32x4_t test_vmlsl_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
return vmlsl_high_n_u16(a, b, c);
}
// CHECK-LABEL: @test_vmlsl_high_n_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4
// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB_I_I]]
//
uint64x2_t test_vmlsl_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
return vmlsl_high_n_u32(a, b, c);
}
// CHECK-LABEL: @test_vqdmlsl_high_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #4
// CHECK-NEXT: [[VQDMLSL_V3_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I_I]]) #4
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I_I_I]]
//
int32x4_t test_vqdmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
return vqdmlsl_high_n_s16(a, b, c);
}
// CHECK-LABEL: @test_vqdmlsl_high_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #4
// CHECK-NEXT: [[VQDMLSL_V3_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I_I]]) #4
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I_I_I]]
//
int64x2_t test_vqdmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
return vqdmlsl_high_n_s32(a, b, c);
}
// CHECK-LABEL: @test_vmul_n_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[B]], i32 1
// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[A:%.*]], [[VECINIT1_I]]
// CHECK-NEXT: ret <2 x float> [[MUL_I]]
//
float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) {
return vmul_n_f32(a, b);
}
// CHECK-LABEL: @test_vmulq_n_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[B]], i32 3
// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[A:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: ret <4 x float> [[MUL_I]]
//
float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) {
return vmulq_n_f32(a, b);
}
// CHECK-LABEL: @test_vmulq_n_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[B]], i32 1
// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x double> [[A:%.*]], [[VECINIT1_I]]
// CHECK-NEXT: ret <2 x double> [[MUL_I]]
//
float64x2_t test_vmulq_n_f64(float64x2_t a, float64_t b) {
return vmulq_n_f64(a, b);
}
// CHECK-LABEL: @test_vfma_n_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float [[N:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[N]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]]) #4
// CHECK-NEXT: ret <2 x float> [[TMP3]]
//
float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
return vfma_n_f32(a, b, n);
}
// CHECK-LABEL: @test_vfma_n_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x double> undef, double [[N:%.*]], i32 0
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[VECINIT_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[B]], <1 x double> [[VECINIT_I]], <1 x double> [[A]]) #4
// CHECK-NEXT: ret <1 x double> [[TMP3]]
//
float64x1_t test_vfma_n_f64(float64x1_t a, float64x1_t b, float64_t n) {
return vfma_n_f64(a, b, n);
}
// CHECK-LABEL: @test_vfmaq_n_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[N:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[N]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[N]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[N]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]]) #4
// CHECK-NEXT: ret <4 x float> [[TMP3]]
//
float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
return vfmaq_n_f32(a, b, n);
}
// CHECK-LABEL: @test_vfms_n_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <2 x float> [[B:%.*]]
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float [[N:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[N]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG_I]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FNEG_I]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]]) #4
// CHECK-NEXT: ret <2 x float> [[TMP3]]
//
float32x2_t test_vfms_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
return vfms_n_f32(a, b, n);
}
// CHECK-LABEL: @test_vfms_n_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <1 x double> [[B:%.*]]
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x double> undef, double [[N:%.*]], i32 0
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG_I]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[VECINIT_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FNEG_I]], <1 x double> [[VECINIT_I]], <1 x double> [[A]]) #4
// CHECK-NEXT: ret <1 x double> [[TMP3]]
//
float64x1_t test_vfms_n_f64(float64x1_t a, float64x1_t b, float64_t n) {
return vfms_n_f64(a, b, n);
}
// CHECK-LABEL: @test_vfmsq_n_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <4 x float> [[B:%.*]]
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[N:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[N]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[N]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[N]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG_I]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FNEG_I]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]]) #4
// CHECK-NEXT: ret <4 x float> [[TMP3]]
//
float32x4_t test_vfmsq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
return vfmsq_n_f32(a, b, n);
}
// CHECK-LABEL: @test_vmul_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[A:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: ret <4 x i16> [[MUL_I]]
//
int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) {
return vmul_n_s16(a, b);
}
// CHECK-LABEL: @test_vmulq_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[A:%.*]], [[VECINIT7_I]]
// CHECK-NEXT: ret <8 x i16> [[MUL_I]]
//
int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) {
return vmulq_n_s16(a, b);
}
// CHECK-LABEL: @test_vmul_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[A:%.*]], [[VECINIT1_I]]
// CHECK-NEXT: ret <2 x i32> [[MUL_I]]
//
int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) {
return vmul_n_s32(a, b);
}
// CHECK-LABEL: @test_vmulq_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[A:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: ret <4 x i32> [[MUL_I]]
//
int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) {
return vmulq_n_s32(a, b);
}
// CHECK-LABEL: @test_vmul_n_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[A:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: ret <4 x i16> [[MUL_I]]
//
uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) {
return vmul_n_u16(a, b);
}
// CHECK-LABEL: @test_vmulq_n_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[A:%.*]], [[VECINIT7_I]]
// CHECK-NEXT: ret <8 x i16> [[MUL_I]]
//
uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) {
return vmulq_n_u16(a, b);
}
// CHECK-LABEL: @test_vmul_n_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[A:%.*]], [[VECINIT1_I]]
// CHECK-NEXT: ret <2 x i32> [[MUL_I]]
//
uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) {
return vmul_n_u32(a, b);
}
// CHECK-LABEL: @test_vmulq_n_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[A:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: ret <4 x i32> [[MUL_I]]
//
uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) {
return vmulq_n_u32(a, b);
}
// CHECK-LABEL: @test_vmull_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #4
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
//
int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
return vmull_n_s16(a, b);
}
// CHECK-LABEL: @test_vmull_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #4
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]]
//
int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
return vmull_n_s32(a, b);
}
// CHECK-LABEL: @test_vmull_n_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #4
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
//
uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
return vmull_n_u16(a, b);
}
// CHECK-LABEL: @test_vmull_n_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #4
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]]
//
uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {
return vmull_n_u32(a, b);
}
// CHECK-LABEL: @test_vqdmull_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #4
// CHECK-NEXT: [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I_I]]
//
int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
return vqdmull_n_s16(a, b);
}
// CHECK-LABEL: @test_vqdmull_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #4
// CHECK-NEXT: [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I_I]]
//
int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
return vqdmull_n_s32(a, b);
}
// CHECK-LABEL: @test_vqdmulh_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VQDMULH_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #4
// CHECK-NEXT: [[VQDMULH_V3_I_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I_I]]
//
int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
return vqdmulh_n_s16(a, b);
}
// CHECK-LABEL: @test_vqdmulhq_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
// CHECK-NEXT: [[VQDMULHQ_V2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]]) #4
// CHECK-NEXT: [[VQDMULHQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I_I]]
//
int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
return vqdmulhq_n_s16(a, b);
}
// CHECK-LABEL: @test_vqdmulh_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VQDMULH_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #4
// CHECK-NEXT: [[VQDMULH_V3_I_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQDMULH_V2_I_I]]
//
int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
return vqdmulh_n_s32(a, b);
}
// CHECK-LABEL: @test_vqdmulhq_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
// CHECK-NEXT: [[VQDMULHQ_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]]) #4
// CHECK-NEXT: [[VQDMULHQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I_I]]
//
int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
return vqdmulhq_n_s32(a, b);
}
// CHECK-LABEL: @test_vqrdmulh_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULH_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #4
// CHECK-NEXT: [[VQRDMULH_V3_I_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I_I]]
//
int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
return vqrdmulh_n_s16(a, b);
}
// CHECK-LABEL: @test_vqrdmulhq_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULHQ_V2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]]) #4
// CHECK-NEXT: [[VQRDMULHQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I_I]]
//
int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
return vqrdmulhq_n_s16(a, b);
}
// CHECK-LABEL: @test_vqrdmulh_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULH_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #4
// CHECK-NEXT: [[VQRDMULH_V3_I_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I_I]]
//
int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
return vqrdmulh_n_s32(a, b);
}
// CHECK-LABEL: @test_vqrdmulhq_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULHQ_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]]) #4
// CHECK-NEXT: [[VQRDMULHQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I_I]]
//
int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) {
return vqrdmulhq_n_s32(a, b);
}
// CHECK-LABEL: @test_vmla_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
//
int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
return vmla_n_s16(a, b, c);
}
// CHECK-LABEL: @test_vmlaq_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B:%.*]], [[VECINIT7_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
//
int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
return vmlaq_n_s16(a, b, c);
}
// CHECK-LABEL: @test_vmla_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B:%.*]], [[VECINIT1_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
//
int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
return vmla_n_s32(a, b, c);
}
// CHECK-LABEL: @test_vmlaq_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
//
int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
return vmlaq_n_s32(a, b, c);
}
// CHECK-LABEL: @test_vmla_n_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
//
uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
return vmla_n_u16(a, b, c);
}
// CHECK-LABEL: @test_vmlaq_n_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B:%.*]], [[VECINIT7_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
//
uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
return vmlaq_n_u16(a, b, c);
}
// CHECK-LABEL: @test_vmla_n_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B:%.*]], [[VECINIT1_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
//
uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
return vmla_n_u32(a, b, c);
}
// CHECK-LABEL: @test_vmlaq_n_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
//
uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
return vmlaq_n_u32(a, b, c);
}
// CHECK-LABEL: @test_vmlal_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #4
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
//
int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
return vmlal_n_s16(a, b, c);
}
// CHECK-LABEL: @test_vmlal_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #4
// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
//
int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
return vmlal_n_s32(a, b, c);
}
// CHECK-LABEL: @test_vmlal_n_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #4
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
//
uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
return vmlal_n_u16(a, b, c);
}
// CHECK-LABEL: @test_vmlal_n_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #4
// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
//
uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
return vmlal_n_u32(a, b, c);
}
// CHECK-LABEL: @test_vqdmlal_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #4
// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]]) #4
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I_I]]
//
int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
return vqdmlal_n_s16(a, b, c);
}
// CHECK-LABEL: @test_vqdmlal_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #4
// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]]) #4
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I_I]]
//
int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
return vqdmlal_n_s32(a, b, c);
}
// CHECK-LABEL: @test_vmls_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <4 x i16> [[SUB_I]]
//
int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
return vmls_n_s16(a, b, c);
}
// CHECK-LABEL: @test_vmlsq_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B:%.*]], [[VECINIT7_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
//
int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
return vmlsq_n_s16(a, b, c);
}
// CHECK-LABEL: @test_vmls_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B:%.*]], [[VECINIT1_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <2 x i32> [[SUB_I]]
//
int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
return vmls_n_s32(a, b, c);
}
// CHECK-LABEL: @test_vmlsq_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
//
int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
return vmlsq_n_s32(a, b, c);
}
// CHECK-LABEL: @test_vmls_n_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <4 x i16> [[SUB_I]]
//
uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
return vmls_n_u16(a, b, c);
}
// CHECK-LABEL: @test_vmlsq_n_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B:%.*]], [[VECINIT7_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
//
uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
return vmlsq_n_u16(a, b, c);
}
// CHECK-LABEL: @test_vmls_n_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B:%.*]], [[VECINIT1_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <2 x i32> [[SUB_I]]
//
uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
return vmls_n_u32(a, b, c);
}
// CHECK-LABEL: @test_vmlsq_n_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
//
uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
return vmlsq_n_u32(a, b, c);
}
// CHECK-LABEL: @test_vmlsl_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #4
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
//
int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
return vmlsl_n_s16(a, b, c);
}
// CHECK-LABEL: @test_vmlsl_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #4
// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
//
int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
return vmlsl_n_s32(a, b, c);
}
// CHECK-LABEL: @test_vmlsl_n_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #4
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
//
uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
return vmlsl_n_u16(a, b, c);
}
// CHECK-LABEL: @test_vmlsl_n_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #4
// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
//
uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
return vmlsl_n_u32(a, b, c);
}
// CHECK-LABEL: @test_vqdmlsl_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #4
// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]]) #4
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I_I]]
//
int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
return vqdmlsl_n_s16(a, b, c);
}
// CHECK-LABEL: @test_vqdmlsl_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #4
// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]]) #4
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I_I]]
//
int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
return vqdmlsl_n_s32(a, b, c);
}
// CHECK-LABEL: @test_vmla_lane_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[ADD]]
//
uint16x4_t test_vmla_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
return vmla_lane_u16(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlaq_lane_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[ADD]]
//
uint16x8_t test_vmlaq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
return vmlaq_lane_u16(a, b, v, 0);
}
// CHECK-LABEL: @test_vmla_lane_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[ADD]]
//
uint32x2_t test_vmla_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
return vmla_lane_u32(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlaq_lane_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
uint32x4_t test_vmlaq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
return vmlaq_lane_u32(a, b, v, 0);
}
// CHECK-LABEL: @test_vmla_laneq_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[ADD]]
//
uint16x4_t test_vmla_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
return vmla_laneq_u16(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlaq_laneq_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[ADD]]
//
uint16x8_t test_vmlaq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
return vmlaq_laneq_u16(a, b, v, 0);
}
// CHECK-LABEL: @test_vmla_laneq_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[ADD]]
//
uint32x2_t test_vmla_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
return vmla_laneq_u32(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlaq_laneq_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
uint32x4_t test_vmlaq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
return vmlaq_laneq_u32(a, b, v, 0);
}
// CHECK-LABEL: @test_vqdmlal_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
return vqdmlal_laneq_s16(a, b, v, 0);
}
// CHECK-LABEL: @test_vqdmlal_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
return vqdmlal_laneq_s32(a, b, v, 0);
}
// CHECK-LABEL: @test_vqdmlal_high_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
return vqdmlal_high_laneq_s16(a, b, v, 0);
}
// CHECK-LABEL: @test_vqdmlal_high_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
return vqdmlal_high_laneq_s32(a, b, v, 0);
}
// CHECK-LABEL: @test_vmls_lane_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[SUB]]
//
uint16x4_t test_vmls_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
return vmls_lane_u16(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlsq_lane_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[SUB]]
//
uint16x8_t test_vmlsq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
return vmlsq_lane_u16(a, b, v, 0);
}
// CHECK-LABEL: @test_vmls_lane_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[SUB]]
//
uint32x2_t test_vmls_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
return vmls_lane_u32(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlsq_lane_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
uint32x4_t test_vmlsq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
return vmlsq_lane_u32(a, b, v, 0);
}
// CHECK-LABEL: @test_vmls_laneq_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[SUB]]
//
uint16x4_t test_vmls_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
return vmls_laneq_u16(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlsq_laneq_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[SUB]]
//
uint16x8_t test_vmlsq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
return vmlsq_laneq_u16(a, b, v, 0);
}
// CHECK-LABEL: @test_vmls_laneq_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[SUB]]
//
uint32x2_t test_vmls_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
return vmls_laneq_u32(a, b, v, 0);
}
// CHECK-LABEL: @test_vmlsq_laneq_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
uint32x4_t test_vmlsq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
return vmlsq_laneq_u32(a, b, v, 0);
}
// CHECK-LABEL: @test_vqdmlsl_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
return vqdmlsl_laneq_s16(a, b, v, 0);
}
// CHECK-LABEL: @test_vqdmlsl_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
return vqdmlsl_laneq_s32(a, b, v, 0);
}
// CHECK-LABEL: @test_vqdmlsl_high_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
return vqdmlsl_high_laneq_s16(a, b, v, 0);
}
// CHECK-LABEL: @test_vqdmlsl_high_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
return vqdmlsl_high_laneq_s32(a, b, v, 0);
}
// CHECK-LABEL: @test_vqdmulh_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQDMULH_LANEQ_V]], <8 x i16> [[VQDMULH_LANEQ_V1]], i32 0)
// CHECK-NEXT: ret <4 x i16> [[VQDMULH_LANEQ_V2]]
//
int16x4_t test_vqdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
return vqdmulh_laneq_s16(a, v, 0);
}
// CHECK-LABEL: @test_vqdmulhq_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQDMULHQ_LANEQ_V]], <8 x i16> [[VQDMULHQ_LANEQ_V1]], i32 0)
// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_LANEQ_V2]]
//
int16x8_t test_vqdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
return vqdmulhq_laneq_s16(a, v, 0);
}
// CHECK-LABEL: @test_vqdmulh_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQDMULH_LANEQ_V]], <4 x i32> [[VQDMULH_LANEQ_V1]], i32 0)
// CHECK-NEXT: ret <2 x i32> [[VQDMULH_LANEQ_V2]]
//
int32x2_t test_vqdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
return vqdmulh_laneq_s32(a, v, 0);
}
// CHECK-LABEL: @test_vqdmulhq_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQDMULHQ_LANEQ_V]], <4 x i32> [[VQDMULHQ_LANEQ_V1]], i32 0)
// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_LANEQ_V2]]
//
int32x4_t test_vqdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
return vqdmulhq_laneq_s32(a, v, 0);
}
// CHECK-LABEL: @test_vqrdmulh_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQRDMULH_LANEQ_V]], <8 x i16> [[VQRDMULH_LANEQ_V1]], i32 0)
// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_LANEQ_V2]]
//
int16x4_t test_vqrdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
return vqrdmulh_laneq_s16(a, v, 0);
}
// CHECK-LABEL: @test_vqrdmulhq_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQRDMULHQ_LANEQ_V]], <8 x i16> [[VQRDMULHQ_LANEQ_V1]], i32 0)
// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_LANEQ_V2]]
//
int16x8_t test_vqrdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
return vqrdmulhq_laneq_s16(a, v, 0);
}
// CHECK-LABEL: @test_vqrdmulh_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQRDMULH_LANEQ_V]], <4 x i32> [[VQRDMULH_LANEQ_V1]], i32 0)
// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_LANEQ_V2]]
//
int32x2_t test_vqrdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
return vqrdmulh_laneq_s32(a, v, 0);
}
// CHECK-LABEL: @test_vqrdmulhq_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQRDMULHQ_LANEQ_V]], <4 x i32> [[VQRDMULHQ_LANEQ_V1]], i32 0)
// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_LANEQ_V2]]
//
int32x4_t test_vqrdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
return vqrdmulhq_laneq_s32(a, v, 0);
}
// CHECK-LABEL: @test_vmla_lane_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[ADD]]
//
uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
return vmla_lane_u16(a, b, v, 3);
}
// CHECK-LABEL: @test_vmlaq_lane_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[ADD]]
//
uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
return vmlaq_lane_u16(a, b, v, 3);
}
// CHECK-LABEL: @test_vmla_lane_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[ADD]]
//
uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
return vmla_lane_u32(a, b, v, 1);
}
// CHECK-LABEL: @test_vmlaq_lane_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
return vmlaq_lane_u32(a, b, v, 1);
}
// CHECK-LABEL: @test_vmla_laneq_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[ADD]]
//
uint16x4_t test_vmla_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
return vmla_laneq_u16(a, b, v, 7);
}
// CHECK-LABEL: @test_vmlaq_laneq_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[ADD]]
//
uint16x8_t test_vmlaq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
return vmlaq_laneq_u16(a, b, v, 7);
}
// CHECK-LABEL: @test_vmla_laneq_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[ADD]]
//
uint32x2_t test_vmla_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
return vmla_laneq_u32(a, b, v, 3);
}
// CHECK-LABEL: @test_vmlaq_laneq_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
uint32x4_t test_vmlaq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
return vmlaq_laneq_u32(a, b, v, 3);
}
// CHECK-LABEL: @test_vqdmlal_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
return vqdmlal_laneq_s16(a, b, v, 7);
}
// CHECK-LABEL: @test_vqdmlal_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
return vqdmlal_laneq_s32(a, b, v, 3);
}
// CHECK-LABEL: @test_vqdmlal_high_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
return vqdmlal_high_laneq_s16(a, b, v, 7);
}
// CHECK-LABEL: @test_vqdmlal_high_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
return vqdmlal_high_laneq_s32(a, b, v, 3);
}
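// The unsigned vmls_lane/vmlsq_lane and laneq variants mirror the vmla tests
// above, lowering to a lane splat, mul, and sub.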
// CHECK-LABEL: @test_vmls_lane_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[SUB]]
//
uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
return vmls_lane_u16(a, b, v, 3);
}
// CHECK-LABEL: @test_vmlsq_lane_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[SUB]]
//
uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
return vmlsq_lane_u16(a, b, v, 3);
}
// CHECK-LABEL: @test_vmls_lane_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[SUB]]
//
uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
return vmls_lane_u32(a, b, v, 1);
}
// CHECK-LABEL: @test_vmlsq_lane_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
return vmlsq_lane_u32(a, b, v, 1);
}
// CHECK-LABEL: @test_vmls_laneq_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[SUB]]
//
uint16x4_t test_vmls_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
return vmls_laneq_u16(a, b, v, 7);
}
// CHECK-LABEL: @test_vmlsq_laneq_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[SUB]]
//
uint16x8_t test_vmlsq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
return vmlsq_laneq_u16(a, b, v, 7);
}
// CHECK-LABEL: @test_vmls_laneq_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[SUB]]
//
uint32x2_t test_vmls_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
return vmls_laneq_u32(a, b, v, 3);
}
// CHECK-LABEL: @test_vmlsq_laneq_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
uint32x4_t test_vmlsq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
return vmlsq_laneq_u32(a, b, v, 3);
}
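// vqdmlsl_laneq and vqdmlsl_high_laneq mirror the vqdmlal tests, using
// llvm.aarch64.neon.sqsub for the saturating subtraction.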
// CHECK-LABEL: @test_vqdmlsl_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
return vqdmlsl_laneq_s16(a, b, v, 7);
}
// CHECK-LABEL: @test_vqdmlsl_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
return vqdmlsl_laneq_s32(a, b, v, 3);
}
// CHECK-LABEL: @test_vqdmlsl_high_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
return vqdmlsl_high_laneq_s16(a, b, v, 7);
}
// CHECK-LABEL: @test_vqdmlsl_high_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
return vqdmlsl_high_laneq_s32(a, b, v, 3);
}
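// The remaining tests cover the vqdmulh/vqrdmulh laneq variants, which are
// emitted as the dedicated llvm.aarch64.neon.sqdmulh.laneq and
// llvm.aarch64.neon.sqrdmulh.laneq IR intrinsics carrying the lane index as an
// immediate operand rather than as a splat shufflevector.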
// CHECK-LABEL: @test_vqdmulh_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQDMULH_LANEQ_V]], <8 x i16> [[VQDMULH_LANEQ_V1]], i32 7)
// CHECK-NEXT: ret <4 x i16> [[VQDMULH_LANEQ_V2]]
//
int16x4_t test_vqdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
return vqdmulh_laneq_s16(a, v, 7);
}
// CHECK-LABEL: @test_vqdmulhq_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQDMULHQ_LANEQ_V]], <8 x i16> [[VQDMULHQ_LANEQ_V1]], i32 7)
// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_LANEQ_V2]]
//
int16x8_t test_vqdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
return vqdmulhq_laneq_s16(a, v, 7);
}
// CHECK-LABEL: @test_vqdmulh_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQDMULH_LANEQ_V]], <4 x i32> [[VQDMULH_LANEQ_V1]], i32 3)
// CHECK-NEXT: ret <2 x i32> [[VQDMULH_LANEQ_V2]]
//
int32x2_t test_vqdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
return vqdmulh_laneq_s32(a, v, 3);
}
// CHECK-LABEL: @test_vqdmulhq_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQDMULHQ_LANEQ_V]], <4 x i32> [[VQDMULHQ_LANEQ_V1]], i32 3)
// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_LANEQ_V2]]
//
int32x4_t test_vqdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
return vqdmulhq_laneq_s32(a, v, 3);
}
// CHECK-LABEL: @test_vqrdmulh_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQRDMULH_LANEQ_V]], <8 x i16> [[VQRDMULH_LANEQ_V1]], i32 7)
// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_LANEQ_V2]]
//
int16x4_t test_vqrdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
return vqrdmulh_laneq_s16(a, v, 7);
}
// CHECK-LABEL: @test_vqrdmulhq_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQRDMULHQ_LANEQ_V]], <8 x i16> [[VQRDMULHQ_LANEQ_V1]], i32 7)
// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_LANEQ_V2]]
//
int16x8_t test_vqrdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
return vqrdmulhq_laneq_s16(a, v, 7);
}
// CHECK-LABEL: @test_vqrdmulh_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQRDMULH_LANEQ_V]], <4 x i32> [[VQRDMULH_LANEQ_V1]], i32 3)
// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_LANEQ_V2]]
//
int32x2_t test_vqrdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
return vqrdmulh_laneq_s32(a, v, 3);
}
// CHECK-LABEL: @test_vqrdmulhq_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQRDMULHQ_LANEQ_V]], <4 x i32> [[VQRDMULHQ_LANEQ_V1]], i32 3)
// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_LANEQ_V2]]
//
int32x4_t test_vqrdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
return vqrdmulhq_laneq_s32(a, v, 3);
}