forked from OSchip/llvm-project
[AArch64]: BFloat Load/Store Intrinsics&CodeGen
This patch upstreams support for ld / st variants of BFloat intrinsics in from __bf16 to AArch64. This includes IR intrinsics. Unittests are provided as needed. This patch is part of a series implementing the Bfloat16 extension of the Armv8.6-a architecture, as detailed here: https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/arm-architecture-developments-armv8-6-a The bfloat type, and its properties are specified in the Arm Architecture Reference Manual: https://developer.arm.com/docs/ddi0487/latest/arm-architecture-reference-manual-armv8-for-armv8-a-architecture-profile The following people contributed to this patch: - Luke Geeson - Momchil Velikov - Luke Cheeseman Reviewers: fpetrogalli, SjoerdMeijer, sdesmalen, t.p.northover, stuij Reviewed By: stuij Subscribers: arsenm, pratlucas, simon_tatham, labrinea, kristof.beyls, hiraditya, danielkiss, cfe-commits, llvm-commits, pbarrio, stuij Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D80716 Change-Id: I22e1dca2a8a9ec25d1e4f4b200cb50ea493d2575
This commit is contained in:
parent
e830fa260d
commit
508a4764c0
|
@ -1887,6 +1887,39 @@ let ArchGuard = "defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC)" in {
|
|||
def VSET_LANE_BF : IOpInst<"vset_lane", ".1.I", "b", OP_SCALAR_BF16_SET_LN>;
|
||||
def VGET_LANEQ_BF : IOpInst<"vget_lane", "1.I", "Qb", OP_SCALAR_BF16_GET_LNQ>;
|
||||
def VSET_LANEQ_BF : IOpInst<"vset_lane", ".1.I", "Qb", OP_SCALAR_BF16_SET_LNQ>;
|
||||
|
||||
def VLD1_BF : WInst<"vld1", ".(c*!)", "bQb">;
|
||||
def VLD2_BF : WInst<"vld2", "2(c*!)", "bQb">;
|
||||
def VLD3_BF : WInst<"vld3", "3(c*!)", "bQb">;
|
||||
def VLD4_BF : WInst<"vld4", "4(c*!)", "bQb">;
|
||||
|
||||
def VST1_BF : WInst<"vst1", "v*(.!)", "bQb">;
|
||||
def VST2_BF : WInst<"vst2", "v*(2!)", "bQb">;
|
||||
def VST3_BF : WInst<"vst3", "v*(3!)", "bQb">;
|
||||
def VST4_BF : WInst<"vst4", "v*(4!)", "bQb">;
|
||||
|
||||
def VLD1_X2_BF : WInst<"vld1_x2", "2(c*!)", "bQb">;
|
||||
def VLD1_X3_BF : WInst<"vld1_x3", "3(c*!)", "bQb">;
|
||||
def VLD1_X4_BF : WInst<"vld1_x4", "4(c*!)", "bQb">;
|
||||
|
||||
def VST1_X2_BF : WInst<"vst1_x2", "v*(2!)", "bQb">;
|
||||
def VST1_X3_BF : WInst<"vst1_x3", "v*(3!)", "bQb">;
|
||||
def VST1_X4_BF : WInst<"vst1_x4", "v*(4!)", "bQb">;
|
||||
|
||||
def VLD1_LANE_BF : WInst<"vld1_lane", ".(c*!).I", "bQb">;
|
||||
def VLD2_LANE_BF : WInst<"vld2_lane", "2(c*!)2I", "bQb">;
|
||||
def VLD3_LANE_BF : WInst<"vld3_lane", "3(c*!)3I", "bQb">;
|
||||
def VLD4_LANE_BF : WInst<"vld4_lane", "4(c*!)4I", "bQb">;
|
||||
def VST1_LANE_BF : WInst<"vst1_lane", "v*(.!)I", "bQb">;
|
||||
def VST2_LANE_BF : WInst<"vst2_lane", "v*(2!)I", "bQb">;
|
||||
def VST3_LANE_BF : WInst<"vst3_lane", "v*(3!)I", "bQb">;
|
||||
def VST4_LANE_BF : WInst<"vst4_lane", "v*(4!)I", "bQb">;
|
||||
|
||||
def VLD1_DUP_BF : WInst<"vld1_dup", ".(c*!)", "bQb">;
|
||||
def VLD2_DUP_BF : WInst<"vld2_dup", "2(c*!)", "bQb">;
|
||||
def VLD3_DUP_BF : WInst<"vld3_dup", "3(c*!)", "bQb">;
|
||||
def VLD4_DUP_BF : WInst<"vld4_dup", "4(c*!)", "bQb">;
|
||||
|
||||
}
|
||||
|
||||
let ArchGuard = "defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) && !defined(__aarch64__)" in {
|
||||
|
|
|
@ -0,0 +1,415 @@
|
|||
// RUN: %clang_cc1 -triple aarch64-arm-none-eabi -target-feature +neon -target-feature +bf16 \
|
||||
// RUN: -O2 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK64
|
||||
// RUN: %clang_cc1 -triple armv8.6a-arm-none-eabi -target-feature +neon -target-feature +bf16 -mfloat-abi hard \
|
||||
// RUN: -O2 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK32
|
||||
|
||||
#include "arm_neon.h"
|
||||
|
||||
bfloat16x4_t test_vld1_bf16(bfloat16_t const *ptr) {
|
||||
return vld1_bf16(ptr);
|
||||
}
|
||||
// CHECK-LABEL: test_vld1_bf16
|
||||
// CHECK64: %1 = load <4 x bfloat>, <4 x bfloat>* %0
|
||||
// CHECK64-NEXT: ret <4 x bfloat> %1
|
||||
// CHECK32: %1 = load <4 x bfloat>, <4 x bfloat>* %0, align 2
|
||||
// CHECK32-NEXT: ret <4 x bfloat> %1
|
||||
|
||||
bfloat16x8_t test_vld1q_bf16(bfloat16_t const *ptr) {
|
||||
return vld1q_bf16(ptr);
|
||||
}
|
||||
// CHECK-LABEL: test_vld1q_bf16
|
||||
// CHECK64: %1 = load <8 x bfloat>, <8 x bfloat>* %0
|
||||
// CHECK64-NEXT: ret <8 x bfloat> %1
|
||||
// CHECK32: %1 = load <8 x bfloat>, <8 x bfloat>* %0, align 2
|
||||
// CHECK32-NEXT: ret <8 x bfloat> %1
|
||||
|
||||
bfloat16x4_t test_vld1_lane_bf16(bfloat16_t const *ptr, bfloat16x4_t src) {
|
||||
return vld1_lane_bf16(ptr, src, 0);
|
||||
}
|
||||
// CHECK-LABEL: test_vld1_lane_bf16
|
||||
// CHECK64: %0 = load bfloat, bfloat* %ptr, align 2
|
||||
// CHECK64-NEXT: %vld1_lane = insertelement <4 x bfloat> %src, bfloat %0, i32 0
|
||||
// CHECK64-NEXT: ret <4 x bfloat> %vld1_lane
|
||||
// CHECK32: %0 = load bfloat, bfloat* %ptr, align 2
|
||||
// CHECK32-NEXT: %vld1_lane = insertelement <4 x bfloat> %src, bfloat %0, i32 0
|
||||
// CHECK32-NEXT: ret <4 x bfloat> %vld1_lane
|
||||
|
||||
bfloat16x8_t test_vld1q_lane_bf16(bfloat16_t const *ptr, bfloat16x8_t src) {
|
||||
return vld1q_lane_bf16(ptr, src, 7);
|
||||
}
|
||||
// CHECK-LABEL: test_vld1q_lane_bf16
|
||||
// CHECK64: %0 = load bfloat, bfloat* %ptr, align 2
|
||||
// CHECK64-NEXT: %vld1_lane = insertelement <8 x bfloat> %src, bfloat %0, i32 7
|
||||
// CHECK64-NEXT: ret <8 x bfloat> %vld1_lane
|
||||
// CHECK32: %0 = load bfloat, bfloat* %ptr, align 2
|
||||
// CHECK32-NEXT: %vld1_lane = insertelement <8 x bfloat> %src, bfloat %0, i32 7
|
||||
// CHECK32-NEXT: ret <8 x bfloat> %vld1_lane
|
||||
|
||||
bfloat16x4_t test_vld1_dup_bf16(bfloat16_t const *ptr) {
|
||||
return vld1_dup_bf16(ptr);
|
||||
}
|
||||
// CHECK-LABEL: test_vld1_dup_bf16
|
||||
// CHECK64: %0 = load bfloat, bfloat* %ptr, align 2
|
||||
// CHECK64-NEXT: %1 = insertelement <4 x bfloat> undef, bfloat %0, i32 0
|
||||
// CHECK64-NEXT: %lane = shufflevector <4 x bfloat> %1, <4 x bfloat> undef, <4 x i32> zeroinitializer
|
||||
// CHECK64-NEXT: ret <4 x bfloat> %lane
|
||||
// CHECK32: %0 = load bfloat, bfloat* %ptr, align 2
|
||||
// CHECK32-NEXT: %1 = insertelement <4 x bfloat> undef, bfloat %0, i32 0
|
||||
// CHECK32-NEXT: %lane = shufflevector <4 x bfloat> %1, <4 x bfloat> undef, <4 x i32> zeroinitializer
|
||||
// CHECK32-NEXT: ret <4 x bfloat> %lane
|
||||
|
||||
bfloat16x4x2_t test_vld1_bf16_x2(bfloat16_t const *ptr) {
|
||||
return vld1_bf16_x2(ptr);
|
||||
}
|
||||
// CHECK-LABEL: test_vld1_bf16_x2
|
||||
// CHECK64: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x2.v4bf16.p0bf16(bfloat* %ptr)
|
||||
// CHECK32: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x2.v4bf16.p0bf16(bfloat* %ptr)
|
||||
|
||||
bfloat16x8x2_t test_vld1q_bf16_x2(bfloat16_t const *ptr) {
|
||||
return vld1q_bf16_x2(ptr);
|
||||
}
|
||||
// CHECK-LABEL: test_vld1q_bf16_x2
|
||||
// CHECK64: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x2.v8bf16.p0bf16(bfloat* %ptr)
|
||||
// CHECK32: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x2.v8bf16.p0bf16(bfloat* %ptr)
|
||||
|
||||
bfloat16x4x3_t test_vld1_bf16_x3(bfloat16_t const *ptr) {
|
||||
return vld1_bf16_x3(ptr);
|
||||
}
|
||||
// CHECK-LABEL: test_vld1_bf16_x3
|
||||
// CHECK64: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x3.v4bf16.p0bf16(bfloat* %ptr)
|
||||
// CHECK32: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x3.v4bf16.p0bf16(bfloat* %ptr)
|
||||
|
||||
bfloat16x8x3_t test_vld1q_bf16_x3(bfloat16_t const *ptr) {
|
||||
return vld1q_bf16_x3(ptr);
|
||||
}
|
||||
// CHECK-LABEL: test_vld1q_bf16_x3
|
||||
// CHECK64: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x3.v8bf16.p0bf16(bfloat* %ptr)
|
||||
// CHECK32: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x3.v8bf16.p0bf16(bfloat* %ptr)
|
||||
|
||||
bfloat16x4x4_t test_vld1_bf16_x4(bfloat16_t const *ptr) {
|
||||
return vld1_bf16_x4(ptr);
|
||||
}
|
||||
// CHECK-LABEL: test_vld1_bf16_x4
|
||||
// CHECK64: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x4.v4bf16.p0bf16(bfloat* %ptr)
|
||||
// CHECK32: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x4.v4bf16.p0bf16(bfloat* %ptr)
|
||||
|
||||
bfloat16x8x4_t test_vld1q_bf16_x4(bfloat16_t const *ptr) {
|
||||
return vld1q_bf16_x4(ptr);
|
||||
}
|
||||
// CHECK-LABEL: test_vld1q_bf16_x4
|
||||
// CHECK64: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x4.v8bf16.p0bf16(bfloat* %ptr)
|
||||
// CHECK32: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x4.v8bf16.p0bf16(bfloat* %ptr)
|
||||
|
||||
bfloat16x8_t test_vld1q_dup_bf16(bfloat16_t const *ptr) {
|
||||
return vld1q_dup_bf16(ptr);
|
||||
}
|
||||
// CHECK-LABEL: test_vld1q_dup_bf16
|
||||
// CHECK64: %0 = load bfloat, bfloat* %ptr, align 2
|
||||
// CHECK64-NEXT: %1 = insertelement <8 x bfloat> undef, bfloat %0, i32 0
|
||||
// CHECK64-NEXT: %lane = shufflevector <8 x bfloat> %1, <8 x bfloat> undef, <8 x i32> zeroinitializer
|
||||
// CHECK64-NEXT: ret <8 x bfloat> %lane
|
||||
// CHECK32: %0 = load bfloat, bfloat* %ptr, align 2
|
||||
// CHECK32-NEXT: %1 = insertelement <8 x bfloat> undef, bfloat %0, i32 0
|
||||
// CHECK32-NEXT: %lane = shufflevector <8 x bfloat> %1, <8 x bfloat> undef, <8 x i32> zeroinitializer
|
||||
// CHECK32-NEXT: ret <8 x bfloat> %lane
|
||||
|
||||
bfloat16x4x2_t test_vld2_bf16(bfloat16_t const *ptr) {
|
||||
return vld2_bf16(ptr);
|
||||
}
|
||||
// CHECK-LABEL: test_vld2_bf16
|
||||
// CHECK64: %0 = bitcast bfloat* %ptr to <4 x bfloat>*
|
||||
// CHECK64-NEXT: %vld2 = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2.v4bf16.p0v4bf16(<4 x bfloat>* %0)
|
||||
// CHECK32: %0 = bitcast bfloat* %ptr to i8*
|
||||
// CHECK32-NEXT: %vld2_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2.v4bf16.p0i8(i8* %0, i32 2)
|
||||
|
||||
bfloat16x8x2_t test_vld2q_bf16(bfloat16_t const *ptr) {
|
||||
return vld2q_bf16(ptr);
|
||||
}
|
||||
// CHECK-LABEL: test_vld2q_bf16
|
||||
// CHECK64: %0 = bitcast bfloat* %ptr to <8 x bfloat>*
|
||||
// CHECK64-NEXT: %vld2 = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2.v8bf16.p0v8bf16(<8 x bfloat>* %0)
|
||||
// CHECK32: %0 = bitcast bfloat* %ptr to i8*
|
||||
// CHECK32-NEXT: %vld2q_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2.v8bf16.p0i8(i8* %0, i32 2)
|
||||
|
||||
bfloat16x4x2_t test_vld2_lane_bf16(bfloat16_t const *ptr, bfloat16x4x2_t src) {
|
||||
return vld2_lane_bf16(ptr, src, 1);
|
||||
}
|
||||
// CHECK-LABEL: test_vld2_lane_bf16
|
||||
// CHECK64: %vld2_lane = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, i64 1, i8* %0)
|
||||
// CHECK32: %vld2_lane_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2lane.v4bf16.p0i8(i8* %2, <4 x bfloat> %0, <4 x bfloat> %1, i32 1, i32 2)
|
||||
|
||||
bfloat16x8x2_t test_vld2q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x2_t src) {
|
||||
return vld2q_lane_bf16(ptr, src, 7);
|
||||
}
|
||||
// CHECK-LABEL: test_vld2q_lane_bf16
|
||||
// CHECK64: %vld2_lane = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, i64 7, i8* %0)
|
||||
// CHECK32: %vld2q_lane_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2lane.v8bf16.p0i8(i8* %2, <8 x bfloat> %0, <8 x bfloat> %1, i32 7, i32 2)
|
||||
|
||||
bfloat16x4x3_t test_vld3_bf16(bfloat16_t const *ptr) {
|
||||
return vld3_bf16(ptr);
|
||||
}
|
||||
// CHECK-LABEL: test_vld3_bf16
|
||||
// CHECK64: %vld3 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3.v4bf16.p0v4bf16(<4 x bfloat>* %0)
|
||||
// CHECK32: %0 = bitcast bfloat* %ptr to i8*
|
||||
// CHECK32-NEXT: %vld3_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3.v4bf16.p0i8(i8* %0, i32 2)
|
||||
|
||||
bfloat16x8x3_t test_vld3q_bf16(bfloat16_t const *ptr) {
|
||||
return vld3q_bf16(ptr);
|
||||
}
|
||||
// CHECK-LABEL: test_vld3q_bf16
|
||||
// CHECK64: %vld3 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3.v8bf16.p0v8bf16(<8 x bfloat>* %0)
|
||||
// CHECK32: %0 = bitcast bfloat* %ptr to i8*
|
||||
// CHECK32-NEXT: %vld3q_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3.v8bf16.p0i8(i8* %0, i32 2)
|
||||
|
||||
bfloat16x4x3_t test_vld3_lane_bf16(bfloat16_t const *ptr, bfloat16x4x3_t src) {
|
||||
return vld3_lane_bf16(ptr, src, 1);
|
||||
}
|
||||
// CHECK-LABEL: test_vld3_lane_bf16
|
||||
// CHECK64: %vld3_lane = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, <4 x bfloat> %src.coerce.fca.2.extract, i64 1, i8* %0)
|
||||
// CHECK32: %3 = bitcast bfloat* %ptr to i8*
|
||||
// CHECK32-NEXT: %vld3_lane_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3lane.v4bf16.p0i8(i8* %3, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 1, i32 2)
|
||||
|
||||
bfloat16x8x3_t test_vld3q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x3_t src) {
|
||||
return vld3q_lane_bf16(ptr, src, 7);
|
||||
// return vld3q_lane_bf16(ptr, src, 8);
|
||||
}
|
||||
// CHECK-LABEL: test_vld3q_lane_bf16
|
||||
// CHECK64: %vld3_lane = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, <8 x bfloat> %src.coerce.fca.2.extract, i64 7, i8* %0)
|
||||
// CHECK32: %3 = bitcast bfloat* %ptr to i8*
|
||||
// CHECK32-NEXT: %vld3q_lane_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3lane.v8bf16.p0i8(i8* %3, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 7, i32 2)
|
||||
|
||||
bfloat16x4x4_t test_vld4_bf16(bfloat16_t const *ptr) {
|
||||
return vld4_bf16(ptr);
|
||||
}
|
||||
// CHECK-LABEL: test_vld4_bf16
|
||||
// CHECK64: %vld4 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4.v4bf16.p0v4bf16(<4 x bfloat>* %0)
|
||||
// CHECK32: %0 = bitcast bfloat* %ptr to i8*
|
||||
// CHECK32-NEXT: %vld4_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4.v4bf16.p0i8(i8* %0, i32 2)
|
||||
|
||||
bfloat16x8x4_t test_vld4q_bf16(bfloat16_t const *ptr) {
|
||||
return vld4q_bf16(ptr);
|
||||
}
|
||||
// CHECK-LABEL: test_vld4q_bf16
|
||||
// CHECK64: %vld4 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4.v8bf16.p0v8bf16(<8 x bfloat>* %0)
|
||||
// CHECK32: %0 = bitcast bfloat* %ptr to i8*
|
||||
// CHECK32-NEXT: %vld4q_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4.v8bf16.p0i8(i8* %0, i32 2)
|
||||
|
||||
bfloat16x4x4_t test_vld4_lane_bf16(bfloat16_t const *ptr, bfloat16x4x4_t src) {
|
||||
return vld4_lane_bf16(ptr, src, 1);
|
||||
}
|
||||
// CHECK-LABEL: test_vld4_lane_bf16
|
||||
// CHECK64: %vld4_lane = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, <4 x bfloat> %src.coerce.fca.2.extract, <4 x bfloat> %src.coerce.fca.3.extract, i64 1, i8* %0)
|
||||
// CHECK32: %4 = bitcast bfloat* %ptr to i8*
|
||||
// CHECK32-NEXT: %vld4_lane_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4lane.v4bf16.p0i8(i8* %4, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 1, i32 2)
|
||||
|
||||
bfloat16x8x4_t test_vld4q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x4_t src) {
|
||||
return vld4q_lane_bf16(ptr, src, 7);
|
||||
}
|
||||
// CHECK-LABEL: test_vld4q_lane_bf16
|
||||
// CHECK64: %vld4_lane = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, <8 x bfloat> %src.coerce.fca.2.extract, <8 x bfloat> %src.coerce.fca.3.extract, i64 7, i8* %0)
|
||||
// CHECK32: %4 = bitcast bfloat* %ptr to i8*
|
||||
// CHECK32-NEXT: %vld4q_lane_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4lane.v8bf16.p0i8(i8* %4, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 7, i32 2)
|
||||
|
||||
bfloat16x4x2_t test_vld2_dup_bf16(bfloat16_t const *ptr) {
|
||||
return vld2_dup_bf16(ptr);
|
||||
}
|
||||
// CHECK-LABEL: test_vld2_dup_bf16
|
||||
// CHECK64: %vld2 = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2r.v4bf16.p0bf16(bfloat* %ptr)
|
||||
// CHECK32: %vld2_dup_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2dup.v4bf16.p0i8(i8* %0, i32 2)
|
||||
|
||||
bfloat16x8x2_t test_vld2q_dup_bf16(bfloat16_t const *ptr) {
|
||||
return vld2q_dup_bf16(ptr);
|
||||
}
|
||||
// CHECK-LABEL: test_vld2q_dup_bf16
|
||||
// CHECK64: %vld2 = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2r.v8bf16.p0bf16(bfloat* %ptr)
|
||||
// CHECK32: %vld2q_dup_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2dup.v8bf16.p0i8(i8* %0, i32 2)
|
||||
|
||||
bfloat16x4x3_t test_vld3_dup_bf16(bfloat16_t const *ptr) {
|
||||
return vld3_dup_bf16(ptr);
|
||||
}
|
||||
// CHECK-LABEL: test_vld3_dup_bf16
|
||||
// CHECK64: %vld3 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3r.v4bf16.p0bf16(bfloat* %ptr)
|
||||
// CHECK32: %vld3_dup_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3dup.v4bf16.p0i8(i8* %0, i32 2)
|
||||
|
||||
bfloat16x8x3_t test_vld3q_dup_bf16(bfloat16_t const *ptr) {
|
||||
return vld3q_dup_bf16(ptr);
|
||||
}
|
||||
// CHECK-LABEL: test_vld3q_dup_bf16
|
||||
// CHECK64: %vld3 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3r.v8bf16.p0bf16(bfloat* %ptr)
|
||||
// CHECK32: %vld3q_dup_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3dup.v8bf16.p0i8(i8* %0, i32 2)
|
||||
|
||||
bfloat16x4x4_t test_vld4_dup_bf16(bfloat16_t const *ptr) {
|
||||
return vld4_dup_bf16(ptr);
|
||||
}
|
||||
// CHECK-LABEL: test_vld4_dup_bf16
|
||||
// CHECK64: %vld4 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4r.v4bf16.p0bf16(bfloat* %ptr)
|
||||
// CHECK32: %vld4_dup_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4dup.v4bf16.p0i8(i8* %0, i32 2)
|
||||
|
||||
bfloat16x8x4_t test_vld4q_dup_bf16(bfloat16_t const *ptr) {
|
||||
return vld4q_dup_bf16(ptr);
|
||||
}
|
||||
// CHECK-LABEL: test_vld4q_dup_bf16
|
||||
// CHECK64: %vld4 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4r.v8bf16.p0bf16(bfloat* %ptr)
|
||||
// CHECK32: %vld4q_dup_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4dup.v8bf16.p0i8(i8* %0, i32 2)
|
||||
|
||||
void test_vst1_bf16(bfloat16_t *ptr, bfloat16x4_t val) {
|
||||
vst1_bf16(ptr, val);
|
||||
}
|
||||
// CHECK-LABEL: test_vst1_bf16
|
||||
// CHECK64: %0 = bitcast bfloat* %ptr to <4 x bfloat>*
|
||||
// CHECK64-NEXT: store <4 x bfloat> %val, <4 x bfloat>* %0, align 2
|
||||
// CHECK32: %0 = bitcast bfloat* %ptr to i8*
|
||||
// CHECK32-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v4bf16(i8* %0, <4 x bfloat> %val, i32 2)
|
||||
|
||||
void test_vst1q_bf16(bfloat16_t *ptr, bfloat16x8_t val) {
|
||||
vst1q_bf16(ptr, val);
|
||||
}
|
||||
// CHECK-LABEL: test_vst1q_bf16
|
||||
// CHECK64: %0 = bitcast bfloat* %ptr to <8 x bfloat>*
|
||||
// CHECK64-NEXT: store <8 x bfloat> %val, <8 x bfloat>* %0, align 2
|
||||
// CHECK32: %0 = bitcast bfloat* %ptr to i8*
|
||||
// CHECK32-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v8bf16(i8* %0, <8 x bfloat> %val, i32 2)
|
||||
|
||||
void test_vst1_lane_bf16(bfloat16_t *ptr, bfloat16x4_t val) {
|
||||
vst1_lane_bf16(ptr, val, 1);
|
||||
}
|
||||
// CHECK-LABEL: test_vst1_lane_bf16
|
||||
// CHECK64: %0 = extractelement <4 x bfloat> %val, i32 1
|
||||
// CHECK64-NEXT: store bfloat %0, bfloat* %ptr, align 2
|
||||
// CHECK32: %0 = extractelement <4 x bfloat> %val, i32 1
|
||||
// CHECK32-NEXT: store bfloat %0, bfloat* %ptr, align 2
|
||||
|
||||
void test_vst1q_lane_bf16(bfloat16_t *ptr, bfloat16x8_t val) {
|
||||
vst1q_lane_bf16(ptr, val, 7);
|
||||
}
|
||||
// CHECK-LABEL: test_vst1q_lane_bf16
|
||||
// CHECK64: %0 = extractelement <8 x bfloat> %val, i32 7
|
||||
// CHECK64-NEXT: store bfloat %0, bfloat* %ptr, align 2
|
||||
// CHECK32: %0 = extractelement <8 x bfloat> %val, i32 7
|
||||
// CHECK32-NEXT: store bfloat %0, bfloat* %ptr, align 2
|
||||
|
||||
void test_vst1_bf16_x2(bfloat16_t *ptr, bfloat16x4x2_t val) {
|
||||
vst1_bf16_x2(ptr, val);
|
||||
}
|
||||
// CHECK-LABEL: test_vst1_bf16_x2
|
||||
// CHECK64: tail call void @llvm.aarch64.neon.st1x2.v4bf16.p0bf16(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, bfloat* %ptr)
|
||||
// CHECK32: tail call void @llvm.arm.neon.vst1x2.p0bf16.v4bf16(bfloat* %ptr, <4 x bfloat> %0, <4 x bfloat> %1)
|
||||
|
||||
void test_vst1q_bf16_x2(bfloat16_t *ptr, bfloat16x8x2_t val) {
|
||||
vst1q_bf16_x2(ptr, val);
|
||||
}
|
||||
// CHECK-LABEL: test_vst1q_bf16_x2
|
||||
// CHECK64: tail call void @llvm.aarch64.neon.st1x2.v8bf16.p0bf16(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, bfloat* %ptr)
|
||||
// CHECK32: tail call void @llvm.arm.neon.vst1x2.p0bf16.v8bf16(bfloat* %ptr, <8 x bfloat> %0, <8 x bfloat> %1)
|
||||
|
||||
void test_vst1_bf16_x3(bfloat16_t *ptr, bfloat16x4x3_t val) {
|
||||
vst1_bf16_x3(ptr, val);
|
||||
}
|
||||
// CHECK-LABEL: test_vst1_bf16_x3
|
||||
// CHECK64: tail call void @llvm.aarch64.neon.st1x3.v4bf16.p0bf16(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, bfloat* %ptr)
|
||||
// CHECK32: tail call void @llvm.arm.neon.vst1x3.p0bf16.v4bf16(bfloat* %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2)
|
||||
|
||||
void test_vst1q_bf16_x3(bfloat16_t *ptr, bfloat16x8x3_t val) {
|
||||
vst1q_bf16_x3(ptr, val);
|
||||
}
|
||||
// CHECK-LABEL: test_vst1q_bf16_x3
|
||||
// CHECK64: tail call void @llvm.aarch64.neon.st1x3.v8bf16.p0bf16(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, bfloat* %ptr)
|
||||
// CHECK32: tail call void @llvm.arm.neon.vst1x3.p0bf16.v8bf16(bfloat* %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2)
|
||||
|
||||
void test_vst1_bf16_x4(bfloat16_t *ptr, bfloat16x4x4_t val) {
|
||||
vst1_bf16_x4(ptr, val);
|
||||
}
|
||||
// CHECK-LABEL: test_vst1_bf16_x4
|
||||
// CHECK64: tail call void @llvm.aarch64.neon.st1x4.v4bf16.p0bf16(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, <4 x bfloat> %val.coerce.fca.3.extract, bfloat* %ptr)
|
||||
// CHECK32: tail call void @llvm.arm.neon.vst1x4.p0bf16.v4bf16(bfloat* %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3)
|
||||
|
||||
void test_vst1q_bf16_x4(bfloat16_t *ptr, bfloat16x8x4_t val) {
|
||||
vst1q_bf16_x4(ptr, val);
|
||||
}
|
||||
// CHECK-LABEL: test_vst1q_bf16_x4
|
||||
// CHECK64: tail call void @llvm.aarch64.neon.st1x4.v8bf16.p0bf16(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, <8 x bfloat> %val.coerce.fca.3.extract, bfloat* %ptr)
|
||||
// CHECK32: tail call void @llvm.arm.neon.vst1x4.p0bf16.v8bf16(bfloat* %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3)
|
||||
|
||||
void test_vst2_bf16(bfloat16_t *ptr, bfloat16x4x2_t val) {
|
||||
vst2_bf16(ptr, val);
|
||||
}
|
||||
// CHECK-LABEL: test_vst2_bf16
|
||||
// CHECK64: tail call void @llvm.aarch64.neon.st2.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, i8* %0)
|
||||
// CHECK32: tail call void @llvm.arm.neon.vst2.p0i8.v4bf16(i8* %2, <4 x bfloat> %0, <4 x bfloat> %1, i32 2)
|
||||
|
||||
void test_vst2q_bf16(bfloat16_t *ptr, bfloat16x8x2_t val) {
|
||||
vst2q_bf16(ptr, val);
|
||||
}
|
||||
// CHECK-LABEL: test_vst2q_bf16
|
||||
// CHECK64: tail call void @llvm.aarch64.neon.st2.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, i8* %0)
|
||||
// CHECK32: tail call void @llvm.arm.neon.vst2.p0i8.v8bf16(i8* %2, <8 x bfloat> %0, <8 x bfloat> %1, i32 2)
|
||||
|
||||
void test_vst2_lane_bf16(bfloat16_t *ptr, bfloat16x4x2_t val) {
|
||||
vst2_lane_bf16(ptr, val, 1);
|
||||
}
|
||||
// CHECK-LABEL: test_vst2_lane_bf16
|
||||
// CHECK64: tail call void @llvm.aarch64.neon.st2lane.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, i64 1, i8* %0)
|
||||
// CHECK32: tail call void @llvm.arm.neon.vst2lane.p0i8.v4bf16(i8* %2, <4 x bfloat> %0, <4 x bfloat> %1, i32 1, i32 2)
|
||||
|
||||
void test_vst2q_lane_bf16(bfloat16_t *ptr, bfloat16x8x2_t val) {
|
||||
vst2q_lane_bf16(ptr, val, 7);
|
||||
}
|
||||
// CHECK-LABEL: test_vst2q_lane_bf16
|
||||
// CHECK64: tail call void @llvm.aarch64.neon.st2lane.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, i64 7, i8* %0)
|
||||
// CHECK32: tail call void @llvm.arm.neon.vst2lane.p0i8.v8bf16(i8* %2, <8 x bfloat> %0, <8 x bfloat> %1, i32 7, i32 2)
|
||||
|
||||
void test_vst3_bf16(bfloat16_t *ptr, bfloat16x4x3_t val) {
|
||||
vst3_bf16(ptr, val);
|
||||
}
|
||||
// CHECK-LABEL: test_vst3_bf16
|
||||
// CHECK64: tail call void @llvm.aarch64.neon.st3.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, i8* %0)
|
||||
// CHECK32: tail call void @llvm.arm.neon.vst3.p0i8.v4bf16(i8* %3, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 2)
|
||||
|
||||
void test_vst3q_bf16(bfloat16_t *ptr, bfloat16x8x3_t val) {
|
||||
vst3q_bf16(ptr, val);
|
||||
}
|
||||
// CHECK-LABEL: test_vst3q_bf16
|
||||
// CHECK64: tail call void @llvm.aarch64.neon.st3.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, i8* %0)
|
||||
// CHECK32: tail call void @llvm.arm.neon.vst3.p0i8.v8bf16(i8* %3, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 2)
|
||||
|
||||
void test_vst3_lane_bf16(bfloat16_t *ptr, bfloat16x4x3_t val) {
|
||||
vst3_lane_bf16(ptr, val, 1);
|
||||
}
|
||||
// CHECK-LABEL: test_vst3_lane_bf16
|
||||
// CHECK64: tail call void @llvm.aarch64.neon.st3lane.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, i64 1, i8* %0)
|
||||
// CHECK32: tail call void @llvm.arm.neon.vst3lane.p0i8.v4bf16(i8* %3, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 1, i32 2)
|
||||
|
||||
void test_vst3q_lane_bf16(bfloat16_t *ptr, bfloat16x8x3_t val) {
|
||||
vst3q_lane_bf16(ptr, val, 7);
|
||||
}
|
||||
// CHECK-LABEL: test_vst3q_lane_bf16
|
||||
// CHECK64: tail call void @llvm.aarch64.neon.st3lane.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, i64 7, i8* %0)
|
||||
// CHECK32: tail call void @llvm.arm.neon.vst3lane.p0i8.v8bf16(i8* %3, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 7, i32 2)
|
||||
|
||||
void test_vst4_bf16(bfloat16_t *ptr, bfloat16x4x4_t val) {
|
||||
vst4_bf16(ptr, val);
|
||||
}
|
||||
// CHECK-LABEL: test_vst4_bf16
|
||||
// CHECK64: tail call void @llvm.aarch64.neon.st4.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, <4 x bfloat> %val.coerce.fca.3.extract, i8* %0)
|
||||
// CHECK32: tail call void @llvm.arm.neon.vst4.p0i8.v4bf16(i8* %4, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 2)
|
||||
|
||||
void test_vst4q_bf16(bfloat16_t *ptr, bfloat16x8x4_t val) {
|
||||
vst4q_bf16(ptr, val);
|
||||
}
|
||||
// CHECK-LABEL: test_vst4q_bf16
|
||||
// CHECK64: tail call void @llvm.aarch64.neon.st4.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, <8 x bfloat> %val.coerce.fca.3.extract, i8* %0)
|
||||
// CHECK32: tail call void @llvm.arm.neon.vst4.p0i8.v8bf16(i8* %4, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 2)
|
||||
|
||||
void test_vst4_lane_bf16(bfloat16_t *ptr, bfloat16x4x4_t val) {
|
||||
vst4_lane_bf16(ptr, val, 1);
|
||||
}
|
||||
// CHECK-LABEL: test_vst4_lane_bf16
|
||||
// CHECK64: tail call void @llvm.aarch64.neon.st4lane.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, <4 x bfloat> %val.coerce.fca.3.extract, i64 1, i8* %0)
|
||||
// CHECK32: tail call void @llvm.arm.neon.vst4lane.p0i8.v4bf16(i8* %4, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 1, i32 2)
|
||||
|
||||
void test_vst4q_lane_bf16(bfloat16_t *ptr, bfloat16x8x4_t val) {
|
||||
vst4q_lane_bf16(ptr, val, 7);
|
||||
}
|
||||
// CHECK-LABEL: test_vst4q_lane_bf16
|
||||
// CHECK64: tail call void @llvm.aarch64.neon.st4lane.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, <8 x bfloat> %val.coerce.fca.3.extract, i64 7, i8* %0)
|
||||
// CHECK32: tail call void @llvm.arm.neon.vst4lane.p0i8.v8bf16(i8* %4, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 7, i32 2)
|
|
@ -0,0 +1,102 @@
|
|||
// RUN: %clang_cc1 -triple aarch64-arm-none-eabi -target-feature +neon -target-feature +bf16 \
|
||||
// RUN: -O2 -fallow-half-arguments-and-returns -verify -fsyntax-only %s
|
||||
|
||||
#include "arm_neon.h"
|
||||
|
||||
int x;
|
||||
|
||||
bfloat16x4_t test_vld1_lane_bf16(bfloat16_t const *ptr, bfloat16x4_t src) {
|
||||
(void)vld1_lane_bf16(ptr, src, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
|
||||
(void)vld1_lane_bf16(ptr, src, 4); // expected-error-re {{argument value {{.*}} is outside the valid range}}
|
||||
return vld1_lane_bf16(ptr, src, x); // expected-error-re {{argument {{.*}} must be a constant integer}}
|
||||
}
|
||||
|
||||
bfloat16x8_t test_vld1q_lane_bf16(bfloat16_t const *ptr, bfloat16x8_t src) {
|
||||
(void)vld1q_lane_bf16(ptr, src, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
|
||||
(void)vld1q_lane_bf16(ptr, src, 8); // expected-error-re {{argument value {{.*}} is outside the valid range}}
|
||||
return vld1q_lane_bf16(ptr, src, x); // expected-error-re {{argument {{.*}} must be a constant integer}}
|
||||
}
|
||||
|
||||
bfloat16x4x2_t test_vld2_lane_bf16(bfloat16_t const *ptr, bfloat16x4x2_t src) {
|
||||
(void)vld2_lane_bf16(ptr, src, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
|
||||
(void)vld2_lane_bf16(ptr, src, 4); // expected-error-re {{argument value {{.*}} is outside the valid range}}
|
||||
return vld2_lane_bf16(ptr, src, x); // expected-error-re {{argument {{.*}} must be a constant integer}}
|
||||
}
|
||||
|
||||
bfloat16x8x2_t test_vld2q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x2_t src) {
|
||||
(void)vld2q_lane_bf16(ptr, src, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
|
||||
(void)vld2q_lane_bf16(ptr, src, 8); // expected-error-re {{argument value {{.*}} is outside the valid range}}
|
||||
return vld2q_lane_bf16(ptr, src, x); // expected-error-re {{argument {{.*}} must be a constant integer}}
|
||||
}
|
||||
|
||||
bfloat16x4x3_t test_vld3_lane_bf16(bfloat16_t const *ptr, bfloat16x4x3_t src) {
|
||||
(void)vld3_lane_bf16(ptr, src, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
|
||||
(void)vld3_lane_bf16(ptr, src, 4); // expected-error-re {{argument value {{.*}} is outside the valid range}}
|
||||
return vld3_lane_bf16(ptr, src, x); // expected-error-re {{argument {{.*}} must be a constant integer}}
|
||||
}
|
||||
|
||||
bfloat16x8x3_t test_vld3q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x3_t src) {
|
||||
(void)vld3q_lane_bf16(ptr, src, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
|
||||
(void)vld3q_lane_bf16(ptr, src, 8); // expected-error-re {{argument value {{.*}} is outside the valid range}}
|
||||
return vld3q_lane_bf16(ptr, src, x); // expected-error-re {{argument {{.*}} must be a constant integer}}
|
||||
}
|
||||
|
||||
bfloat16x4x4_t test_vld4_lane_bf16(bfloat16_t const *ptr, bfloat16x4x4_t src) {
|
||||
(void)vld4_lane_bf16(ptr, src, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
|
||||
(void)vld4_lane_bf16(ptr, src, 4); // expected-error-re {{argument value {{.*}} is outside the valid range}}
|
||||
return vld4_lane_bf16(ptr, src, x); // expected-error-re {{argument {{.*}} must be a constant integer}}
|
||||
}
|
||||
|
||||
bfloat16x8x4_t test_vld4q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x4_t src) {
|
||||
(void)vld4q_lane_bf16(ptr, src, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
|
||||
(void)vld4q_lane_bf16(ptr, src, 8); // expected-error-re {{argument value {{.*}} is outside the valid range}}
|
||||
return vld4q_lane_bf16(ptr, src, x); // expected-error-re {{argument {{.*}} must be a constant integer}}
|
||||
}
|
||||
|
||||
void test_vst1_lane_bf16(bfloat16_t *ptr, bfloat16x4_t val) {
|
||||
vst1_lane_bf16(ptr, val, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
|
||||
vst1_lane_bf16(ptr, val, 4); // expected-error-re {{argument value {{.*}} is outside the valid range}}
|
||||
vst1_lane_bf16(ptr, val, x); // expected-error-re {{argument {{.*}} must be a constant integer}}
|
||||
}
|
||||
|
||||
void test_vst1q_lane_bf16(bfloat16_t *ptr, bfloat16x8_t val) {
|
||||
vst1q_lane_bf16(ptr, val, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
|
||||
vst1q_lane_bf16(ptr, val, 8); // expected-error-re {{argument value {{.*}} is outside the valid range}}
|
||||
vst1q_lane_bf16(ptr, val, x); // expected-error-re {{argument {{.*}} must be a constant integer}}
|
||||
}
|
||||
|
||||
void test_vst2_lane_bf16(bfloat16_t *ptr, bfloat16x4x2_t val) {
|
||||
vst2_lane_bf16(ptr, val, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
|
||||
vst2_lane_bf16(ptr, val, 4); // expected-error-re {{argument value {{.*}} is outside the valid range}}
|
||||
vst2_lane_bf16(ptr, val, x); // expected-error-re {{argument {{.*}} must be a constant integer}}
|
||||
}
|
||||
|
||||
void test_vst2q_lane_bf16(bfloat16_t *ptr, bfloat16x8x2_t val) {
|
||||
vst2q_lane_bf16(ptr, val, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
|
||||
vst2q_lane_bf16(ptr, val, 8); // expected-error-re {{argument value {{.*}} is outside the valid range}}
|
||||
vst2q_lane_bf16(ptr, val, x); // expected-error-re {{argument {{.*}} must be a constant integer}}
|
||||
}
|
||||
|
||||
void test_vst3_lane_bf16(bfloat16_t *ptr, bfloat16x4x3_t val) {
|
||||
vst3_lane_bf16(ptr, val, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
|
||||
vst3_lane_bf16(ptr, val, 4); // expected-error-re {{argument value {{.*}} is outside the valid range}}
|
||||
vst3_lane_bf16(ptr, val, x); // expected-error-re {{argument {{.*}} must be a constant integer}}
|
||||
}
|
||||
|
||||
void test_vst3q_lane_bf16(bfloat16_t *ptr, bfloat16x8x3_t val) {
|
||||
vst3q_lane_bf16(ptr, val, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
|
||||
vst3q_lane_bf16(ptr, val, 8); // expected-error-re {{argument value {{.*}} is outside the valid range}}
|
||||
vst3q_lane_bf16(ptr, val, x); // expected-error-re {{argument {{.*}} must be a constant integer}}
|
||||
}
|
||||
|
||||
void test_vst4_lane_bf16(bfloat16_t *ptr, bfloat16x4x4_t val) {
|
||||
vst4_lane_bf16(ptr, val, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
|
||||
vst4_lane_bf16(ptr, val, 4); // expected-error-re {{argument value {{.*}} is outside the valid range}}
|
||||
vst4_lane_bf16(ptr, val, x); // expected-error-re {{argument {{.*}} must be a constant integer}}
|
||||
}
|
||||
|
||||
void test_vst4q_lane_bf16(bfloat16_t *ptr, bfloat16x8x4_t val) {
|
||||
vst4q_lane_bf16(ptr, val, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}}
|
||||
vst4q_lane_bf16(ptr, val, 8); // expected-error-re {{argument value {{.*}} is outside the valid range}}
|
||||
vst4q_lane_bf16(ptr, val, x); // expected-error-re {{argument {{.*}} must be a constant integer}}
|
||||
}
|
|
@ -3405,10 +3405,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
} else if (VT == MVT::v16i8) {
|
||||
SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
|
||||
SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
|
||||
SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
|
||||
|
@ -3432,10 +3432,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
} else if (VT == MVT::v16i8) {
|
||||
SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
|
||||
SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
|
||||
SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
|
||||
|
@ -3459,10 +3459,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
} else if (VT == MVT::v16i8) {
|
||||
SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
|
||||
SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
|
||||
SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
|
||||
|
@ -3486,10 +3486,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
} else if (VT == MVT::v16i8) {
|
||||
SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
|
||||
SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
|
||||
SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
|
||||
|
@ -3513,10 +3513,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
} else if (VT == MVT::v16i8) {
|
||||
SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
|
||||
SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
|
||||
SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
|
||||
|
@ -3540,10 +3540,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
} else if (VT == MVT::v16i8) {
|
||||
SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
|
||||
SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
|
||||
SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
|
||||
|
@ -3567,10 +3567,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
} else if (VT == MVT::v16i8) {
|
||||
SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
|
||||
SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
|
||||
SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
|
||||
|
@ -3594,10 +3594,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
} else if (VT == MVT::v16i8) {
|
||||
SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
|
||||
SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
|
||||
SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
|
||||
|
@ -3621,10 +3621,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
} else if (VT == MVT::v16i8) {
|
||||
SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
|
||||
SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
|
||||
SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
|
||||
|
@ -3646,7 +3646,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
SelectLoadLane(Node, 2, AArch64::LD2i8);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
|
||||
VT == MVT::v8f16) {
|
||||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
|
||||
SelectLoadLane(Node, 2, AArch64::LD2i16);
|
||||
return;
|
||||
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
|
||||
|
@ -3664,7 +3664,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
SelectLoadLane(Node, 3, AArch64::LD3i8);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
|
||||
VT == MVT::v8f16) {
|
||||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
|
||||
SelectLoadLane(Node, 3, AArch64::LD3i16);
|
||||
return;
|
||||
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
|
||||
|
@ -3682,7 +3682,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
SelectLoadLane(Node, 4, AArch64::LD4i8);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
|
||||
VT == MVT::v8f16) {
|
||||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
|
||||
SelectLoadLane(Node, 4, AArch64::LD4i16);
|
||||
return;
|
||||
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
|
||||
|
@ -3757,10 +3757,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
} else if (VT == MVT::v16i8) {
|
||||
SelectStore(Node, 2, AArch64::ST1Twov16b);
|
||||
return;
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
|
||||
VT == MVT::v4bf16) {
|
||||
SelectStore(Node, 2, AArch64::ST1Twov4h);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
|
||||
VT == MVT::v8bf16) {
|
||||
SelectStore(Node, 2, AArch64::ST1Twov8h);
|
||||
return;
|
||||
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
|
||||
|
@ -3785,10 +3787,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
} else if (VT == MVT::v16i8) {
|
||||
SelectStore(Node, 3, AArch64::ST1Threev16b);
|
||||
return;
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
|
||||
VT == MVT::v4bf16) {
|
||||
SelectStore(Node, 3, AArch64::ST1Threev4h);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
|
||||
VT == MVT::v8bf16) {
|
||||
SelectStore(Node, 3, AArch64::ST1Threev8h);
|
||||
return;
|
||||
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
|
||||
|
@ -3813,10 +3817,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
} else if (VT == MVT::v16i8) {
|
||||
SelectStore(Node, 4, AArch64::ST1Fourv16b);
|
||||
return;
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
|
||||
VT == MVT::v4bf16) {
|
||||
SelectStore(Node, 4, AArch64::ST1Fourv4h);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
|
||||
VT == MVT::v8bf16) {
|
||||
SelectStore(Node, 4, AArch64::ST1Fourv8h);
|
||||
return;
|
||||
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
|
||||
|
@ -3841,10 +3847,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
} else if (VT == MVT::v16i8) {
|
||||
SelectStore(Node, 2, AArch64::ST2Twov16b);
|
||||
return;
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
|
||||
VT == MVT::v4bf16) {
|
||||
SelectStore(Node, 2, AArch64::ST2Twov4h);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
|
||||
VT == MVT::v8bf16) {
|
||||
SelectStore(Node, 2, AArch64::ST2Twov8h);
|
||||
return;
|
||||
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
|
||||
|
@ -3869,10 +3877,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
} else if (VT == MVT::v16i8) {
|
||||
SelectStore(Node, 3, AArch64::ST3Threev16b);
|
||||
return;
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
|
||||
VT == MVT::v4bf16) {
|
||||
SelectStore(Node, 3, AArch64::ST3Threev4h);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
|
||||
VT == MVT::v8bf16) {
|
||||
SelectStore(Node, 3, AArch64::ST3Threev8h);
|
||||
return;
|
||||
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
|
||||
|
@ -3897,10 +3907,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
} else if (VT == MVT::v16i8) {
|
||||
SelectStore(Node, 4, AArch64::ST4Fourv16b);
|
||||
return;
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
|
||||
VT == MVT::v4bf16) {
|
||||
SelectStore(Node, 4, AArch64::ST4Fourv4h);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
|
||||
VT == MVT::v8bf16) {
|
||||
SelectStore(Node, 4, AArch64::ST4Fourv8h);
|
||||
return;
|
||||
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
|
||||
|
@ -3923,7 +3935,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
SelectStoreLane(Node, 2, AArch64::ST2i8);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
|
||||
VT == MVT::v8f16) {
|
||||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
|
||||
SelectStoreLane(Node, 2, AArch64::ST2i16);
|
||||
return;
|
||||
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
|
||||
|
@ -3942,7 +3954,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
SelectStoreLane(Node, 3, AArch64::ST3i8);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
|
||||
VT == MVT::v8f16) {
|
||||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
|
||||
SelectStoreLane(Node, 3, AArch64::ST3i16);
|
||||
return;
|
||||
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
|
||||
|
@ -3961,7 +3973,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
SelectStoreLane(Node, 4, AArch64::ST4i8);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
|
||||
VT == MVT::v8f16) {
|
||||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
|
||||
SelectStoreLane(Node, 4, AArch64::ST4i16);
|
||||
return;
|
||||
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
|
||||
|
@ -4045,10 +4057,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
} else if (VT == MVT::v16i8) {
|
||||
SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
|
||||
SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
|
||||
SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
|
||||
|
@ -4073,10 +4085,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
} else if (VT == MVT::v16i8) {
|
||||
SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
|
||||
SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
|
||||
SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
|
||||
|
@ -4101,10 +4113,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
} else if (VT == MVT::v16i8) {
|
||||
SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
|
||||
SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
|
||||
SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
|
||||
|
@ -4129,10 +4141,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
} else if (VT == MVT::v16i8) {
|
||||
SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
|
||||
SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
|
||||
SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
|
||||
|
@ -4157,10 +4169,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
} else if (VT == MVT::v16i8) {
|
||||
SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
|
||||
SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
|
||||
SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
|
||||
|
@ -4185,10 +4197,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
} else if (VT == MVT::v16i8) {
|
||||
SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
|
||||
SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
|
||||
SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
|
||||
|
@ -4213,10 +4225,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
} else if (VT == MVT::v16i8) {
|
||||
SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
|
||||
SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
|
||||
SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
|
||||
|
@ -4241,10 +4253,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
} else if (VT == MVT::v16i8) {
|
||||
SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
|
||||
SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
|
||||
SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
|
||||
|
@ -4269,10 +4281,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
} else if (VT == MVT::v16i8) {
|
||||
SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
|
||||
SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
|
||||
SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
|
||||
|
@ -4297,10 +4309,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
} else if (VT == MVT::v16i8) {
|
||||
SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
|
||||
SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
|
||||
SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0);
|
||||
return;
|
||||
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
|
||||
|
@ -4323,7 +4335,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
|
||||
VT == MVT::v8f16) {
|
||||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
|
||||
SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST);
|
||||
return;
|
||||
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
|
||||
|
@ -4342,7 +4354,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
|
||||
VT == MVT::v8f16) {
|
||||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
|
||||
SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST);
|
||||
return;
|
||||
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
|
||||
|
@ -4361,7 +4373,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
|
||||
VT == MVT::v8f16) {
|
||||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
|
||||
SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST);
|
||||
return;
|
||||
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
|
||||
|
@ -4380,7 +4392,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
|
||||
VT == MVT::v8f16) {
|
||||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
|
||||
SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST);
|
||||
return;
|
||||
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
|
||||
|
@ -4402,10 +4414,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
} else if (VT == MVT::v16i8) {
|
||||
SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST);
|
||||
return;
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
|
||||
SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
|
||||
SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST);
|
||||
return;
|
||||
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
|
||||
|
@ -4431,10 +4443,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
} else if (VT == MVT::v16i8) {
|
||||
SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST);
|
||||
return;
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
|
||||
SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
|
||||
SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST);
|
||||
return;
|
||||
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
|
||||
|
@ -4460,10 +4472,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
} else if (VT == MVT::v16i8) {
|
||||
SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST);
|
||||
return;
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
|
||||
SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
|
||||
SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST);
|
||||
return;
|
||||
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
|
||||
|
@ -4489,10 +4501,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
} else if (VT == MVT::v16i8) {
|
||||
SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST);
|
||||
return;
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
|
||||
SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
|
||||
SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST);
|
||||
return;
|
||||
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
|
||||
|
@ -4518,10 +4530,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
} else if (VT == MVT::v16i8) {
|
||||
SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST);
|
||||
return;
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
|
||||
SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16 ) {
|
||||
SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST);
|
||||
return;
|
||||
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
|
||||
|
@ -4547,10 +4559,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
} else if (VT == MVT::v16i8) {
|
||||
SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST);
|
||||
return;
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
|
||||
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
|
||||
SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
|
||||
SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST);
|
||||
return;
|
||||
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
|
||||
|
@ -4574,7 +4586,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
|
||||
VT == MVT::v8f16) {
|
||||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
|
||||
SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST);
|
||||
return;
|
||||
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
|
||||
|
@ -4594,7 +4606,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
|
||||
VT == MVT::v8f16) {
|
||||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
|
||||
SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST);
|
||||
return;
|
||||
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
|
||||
|
@ -4614,7 +4626,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
|
|||
SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST);
|
||||
return;
|
||||
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
|
||||
VT == MVT::v8f16) {
|
||||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
|
||||
SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST);
|
||||
return;
|
||||
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
|
||||
|
|
|
@ -2267,6 +2267,7 @@ let Predicates = [IsLE] in {
|
|||
defm : VecROLoadPat<ro64, v8i8, LDRDroW, LDRDroX>;
|
||||
defm : VecROLoadPat<ro64, v4i16, LDRDroW, LDRDroX>;
|
||||
defm : VecROLoadPat<ro64, v4f16, LDRDroW, LDRDroX>;
|
||||
defm : VecROLoadPat<ro64, v4bf16, LDRDroW, LDRDroX>;
|
||||
}
|
||||
|
||||
defm : VecROLoadPat<ro64, v1i64, LDRDroW, LDRDroX>;
|
||||
|
@ -2281,6 +2282,7 @@ let Predicates = [IsLE] in {
|
|||
defm : VecROLoadPat<ro128, v4f32, LDRQroW, LDRQroX>;
|
||||
defm : VecROLoadPat<ro128, v8i16, LDRQroW, LDRQroX>;
|
||||
defm : VecROLoadPat<ro128, v8f16, LDRQroW, LDRQroX>;
|
||||
defm : VecROLoadPat<ro128, v8bf16, LDRQroW, LDRQroX>;
|
||||
defm : VecROLoadPat<ro128, v16i8, LDRQroW, LDRQroX>;
|
||||
}
|
||||
} // AddedComplexity = 10
|
||||
|
@ -2416,6 +2418,8 @@ let Predicates = [IsLE] in {
|
|||
(LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
|
||||
def : Pat<(v4f16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
|
||||
(LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
|
||||
def : Pat<(v4bf16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
|
||||
(LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
|
||||
}
|
||||
def : Pat<(v1f64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
|
||||
(LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
|
||||
|
@ -2439,6 +2443,8 @@ let Predicates = [IsLE] in {
|
|||
(LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
|
||||
def : Pat<(v8f16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
|
||||
(LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
|
||||
def : Pat<(v8bf16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
|
||||
(LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
|
||||
}
|
||||
def : Pat<(f128 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
|
||||
(LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
|
||||
|
@ -2937,6 +2943,7 @@ let Predicates = [IsLE] in {
|
|||
defm : VecROStorePat<ro64, v4i16, FPR64, STRDroW, STRDroX>;
|
||||
defm : VecROStorePat<ro64, v8i8, FPR64, STRDroW, STRDroX>;
|
||||
defm : VecROStorePat<ro64, v4f16, FPR64, STRDroW, STRDroX>;
|
||||
defm : VecROStorePat<ro64, v4bf16, FPR64, STRDroW, STRDroX>;
|
||||
}
|
||||
|
||||
defm : VecROStorePat<ro64, v1i64, FPR64, STRDroW, STRDroX>;
|
||||
|
@ -2952,6 +2959,7 @@ let Predicates = [IsLE, UseSTRQro] in {
|
|||
defm : VecROStorePat<ro128, v8i16, FPR128, STRQroW, STRQroX>;
|
||||
defm : VecROStorePat<ro128, v16i8, FPR128, STRQroW, STRQroX>;
|
||||
defm : VecROStorePat<ro128, v8f16, FPR128, STRQroW, STRQroX>;
|
||||
defm : VecROStorePat<ro128, v8bf16, FPR128, STRQroW, STRQroX>;
|
||||
}
|
||||
} // AddedComplexity = 10
|
||||
|
||||
|
@ -3044,6 +3052,9 @@ let Predicates = [IsLE] in {
|
|||
def : Pat<(store (v4f16 FPR64:$Rt),
|
||||
(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
|
||||
(STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
|
||||
def : Pat<(store (v4bf16 FPR64:$Rt),
|
||||
(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
|
||||
(STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
|
||||
}
|
||||
|
||||
// Match all store 128 bits width whose type is compatible with FPR128
|
||||
|
@ -3074,6 +3085,9 @@ let Predicates = [IsLE] in {
|
|||
def : Pat<(store (v8f16 FPR128:$Rt),
|
||||
(am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
|
||||
(STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
|
||||
def : Pat<(store (v8bf16 FPR128:$Rt),
|
||||
(am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
|
||||
(STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
|
||||
}
|
||||
|
||||
// truncstore i64
|
||||
|
@ -3181,6 +3195,9 @@ let Predicates = [IsLE] in {
|
|||
def : Pat<(store (v4f16 FPR64:$Rt),
|
||||
(am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
|
||||
(STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
|
||||
def : Pat<(store (v4bf16 FPR64:$Rt),
|
||||
(am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
|
||||
(STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
|
||||
}
|
||||
|
||||
// Match all store 128 bits width whose type is compatible with FPR128
|
||||
|
@ -3213,6 +3230,9 @@ let Predicates = [IsLE] in {
|
|||
def : Pat<(store (v8f16 FPR128:$Rt),
|
||||
(am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
|
||||
(STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
|
||||
def : Pat<(store (v8bf16 FPR128:$Rt),
|
||||
(am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
|
||||
(STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
|
||||
}
|
||||
|
||||
} // AddedComplexity = 10
|
||||
|
@ -6350,6 +6370,10 @@ def : Pat<(v4f16 (AArch64dup (f16 (load GPR64sp:$Rn)))),
|
|||
(LD1Rv4h GPR64sp:$Rn)>;
|
||||
def : Pat<(v8f16 (AArch64dup (f16 (load GPR64sp:$Rn)))),
|
||||
(LD1Rv8h GPR64sp:$Rn)>;
|
||||
def : Pat<(v4bf16 (AArch64dup (bf16 (load GPR64sp:$Rn)))),
|
||||
(LD1Rv4h GPR64sp:$Rn)>;
|
||||
def : Pat<(v8bf16 (AArch64dup (bf16 (load GPR64sp:$Rn)))),
|
||||
(LD1Rv8h GPR64sp:$Rn)>;
|
||||
|
||||
class Ld1Lane128Pat<SDPatternOperator scalar_load, Operand VecIndex,
|
||||
ValueType VTy, ValueType STy, Instruction LD1>
|
||||
|
@ -6364,6 +6388,7 @@ def : Ld1Lane128Pat<load, VectorIndexS, v4f32, f32, LD1i32>;
|
|||
def : Ld1Lane128Pat<load, VectorIndexD, v2i64, i64, LD1i64>;
|
||||
def : Ld1Lane128Pat<load, VectorIndexD, v2f64, f64, LD1i64>;
|
||||
def : Ld1Lane128Pat<load, VectorIndexH, v8f16, f16, LD1i16>;
|
||||
def : Ld1Lane128Pat<load, VectorIndexH, v8bf16, bf16, LD1i16>;
|
||||
|
||||
class Ld1Lane64Pat<SDPatternOperator scalar_load, Operand VecIndex,
|
||||
ValueType VTy, ValueType STy, Instruction LD1>
|
||||
|
@ -6379,6 +6404,7 @@ def : Ld1Lane64Pat<extloadi16, VectorIndexH, v4i16, i32, LD1i16>;
|
|||
def : Ld1Lane64Pat<load, VectorIndexS, v2i32, i32, LD1i32>;
|
||||
def : Ld1Lane64Pat<load, VectorIndexS, v2f32, f32, LD1i32>;
|
||||
def : Ld1Lane64Pat<load, VectorIndexH, v4f16, f16, LD1i16>;
|
||||
def : Ld1Lane64Pat<load, VectorIndexH, v4bf16, bf16, LD1i16>;
|
||||
|
||||
|
||||
defm LD1 : SIMDLdSt1SingleAliases<"ld1">;
|
||||
|
@ -6407,6 +6433,7 @@ def : St1Lane128Pat<store, VectorIndexS, v4f32, f32, ST1i32>;
|
|||
def : St1Lane128Pat<store, VectorIndexD, v2i64, i64, ST1i64>;
|
||||
def : St1Lane128Pat<store, VectorIndexD, v2f64, f64, ST1i64>;
|
||||
def : St1Lane128Pat<store, VectorIndexH, v8f16, f16, ST1i16>;
|
||||
def : St1Lane128Pat<store, VectorIndexH, v8bf16, bf16, ST1i16>;
|
||||
|
||||
let AddedComplexity = 19 in
|
||||
class St1Lane64Pat<SDPatternOperator scalar_store, Operand VecIndex,
|
||||
|
@ -6422,6 +6449,7 @@ def : St1Lane64Pat<truncstorei16, VectorIndexH, v4i16, i32, ST1i16>;
|
|||
def : St1Lane64Pat<store, VectorIndexS, v2i32, i32, ST1i32>;
|
||||
def : St1Lane64Pat<store, VectorIndexS, v2f32, f32, ST1i32>;
|
||||
def : St1Lane64Pat<store, VectorIndexH, v4f16, f16, ST1i16>;
|
||||
def : St1Lane64Pat<store, VectorIndexH, v4bf16, bf16, ST1i16>;
|
||||
|
||||
multiclass St1LanePost64Pat<SDPatternOperator scalar_store, Operand VecIndex,
|
||||
ValueType VTy, ValueType STy, Instruction ST1,
|
||||
|
@ -6447,6 +6475,7 @@ defm : St1LanePost64Pat<post_store, VectorIndexS, v2f32, f32, ST1i32_POST, 4>;
|
|||
defm : St1LanePost64Pat<post_store, VectorIndexD, v1i64, i64, ST1i64_POST, 8>;
|
||||
defm : St1LanePost64Pat<post_store, VectorIndexD, v1f64, f64, ST1i64_POST, 8>;
|
||||
defm : St1LanePost64Pat<post_store, VectorIndexH, v4f16, f16, ST1i16_POST, 2>;
|
||||
defm : St1LanePost64Pat<post_store, VectorIndexH, v4bf16, bf16, ST1i16_POST, 2>;
|
||||
|
||||
multiclass St1LanePost128Pat<SDPatternOperator scalar_store, Operand VecIndex,
|
||||
ValueType VTy, ValueType STy, Instruction ST1,
|
||||
|
@ -6471,6 +6500,7 @@ defm : St1LanePost128Pat<post_store, VectorIndexS, v4f32, f32, ST1i32_POST, 4>;
|
|||
defm : St1LanePost128Pat<post_store, VectorIndexD, v2i64, i64, ST1i64_POST, 8>;
|
||||
defm : St1LanePost128Pat<post_store, VectorIndexD, v2f64, f64, ST1i64_POST, 8>;
|
||||
defm : St1LanePost128Pat<post_store, VectorIndexH, v8f16, f16, ST1i16_POST, 2>;
|
||||
defm : St1LanePost128Pat<post_store, VectorIndexH, v8bf16, bf16, ST1i16_POST, 2>;
|
||||
|
||||
let mayStore = 1, hasSideEffects = 0 in {
|
||||
defm ST2 : SIMDStSingleB<1, 0b000, "st2", VecListTwob, GPR64pi2>;
|
||||
|
|
|
@ -0,0 +1,916 @@
|
|||
; RUN: llc -mtriple aarch64-arm-none-eabi -asm-verbose=1 -mattr=+bf16 %s -o - | FileCheck %s
|
||||
|
||||
%struct.bfloat16x4x2_t = type { [2 x <4 x bfloat>] }
|
||||
%struct.bfloat16x8x2_t = type { [2 x <8 x bfloat>] }
|
||||
%struct.bfloat16x4x3_t = type { [3 x <4 x bfloat>] }
|
||||
%struct.bfloat16x8x3_t = type { [3 x <8 x bfloat>] }
|
||||
%struct.bfloat16x4x4_t = type { [4 x <4 x bfloat>] }
|
||||
%struct.bfloat16x8x4_t = type { [4 x <8 x bfloat>] }
|
||||
|
||||
define <4 x bfloat> @test_vld1_bf16(bfloat* nocapture readonly %ptr) local_unnamed_addr nounwind {
|
||||
; CHECK-LABEL: test_vld1_bf16:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ldr d0, [x0]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%0 = bitcast bfloat* %ptr to <4 x bfloat>*
|
||||
%1 = load <4 x bfloat>, <4 x bfloat>* %0, align 2
|
||||
ret <4 x bfloat> %1
|
||||
}
|
||||
|
||||
define <8 x bfloat> @test_vld1q_bf16(bfloat* nocapture readonly %ptr) local_unnamed_addr nounwind {
|
||||
; CHECK-LABEL: test_vld1q_bf16:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ldr q0, [x0]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%0 = bitcast bfloat* %ptr to <8 x bfloat>*
|
||||
%1 = load <8 x bfloat>, <8 x bfloat>* %0, align 2
|
||||
ret <8 x bfloat> %1
|
||||
}
|
||||
|
||||
define <4 x bfloat> @test_vld1_lane_bf16(bfloat* nocapture readonly %ptr, <4 x bfloat> %src) local_unnamed_addr nounwind {
|
||||
; CHECK-LABEL: test_vld1_lane_bf16:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK: ld1 { v0.h }[0], [x0]
|
||||
; CHECK: ret
|
||||
entry:
|
||||
%0 = load bfloat, bfloat* %ptr, align 2
|
||||
%vld1_lane = insertelement <4 x bfloat> %src, bfloat %0, i32 0
|
||||
ret <4 x bfloat> %vld1_lane
|
||||
}
|
||||
|
||||
define <8 x bfloat> @test_vld1q_lane_bf16(bfloat* nocapture readonly %ptr, <8 x bfloat> %src) local_unnamed_addr nounwind {
|
||||
; CHECK-LABEL: test_vld1q_lane_bf16:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ld1 { v0.h }[7], [x0]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%0 = load bfloat, bfloat* %ptr, align 2
|
||||
%vld1_lane = insertelement <8 x bfloat> %src, bfloat %0, i32 7
|
||||
ret <8 x bfloat> %vld1_lane
|
||||
}
|
||||
|
||||
define <4 x bfloat> @test_vld1_dup_bf16(bfloat* nocapture readonly %ptr) local_unnamed_addr nounwind {
|
||||
; CHECK-LABEL: test_vld1_dup_bf16:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ld1r { v0.4h }, [x0]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%0 = load bfloat, bfloat* %ptr, align 2
|
||||
%1 = insertelement <4 x bfloat> undef, bfloat %0, i32 0
|
||||
%lane = shufflevector <4 x bfloat> %1, <4 x bfloat> undef, <4 x i32> zeroinitializer
|
||||
ret <4 x bfloat> %lane
|
||||
}
|
||||
|
||||
define %struct.bfloat16x4x2_t @test_vld1_bf16_x2(bfloat* %ptr) local_unnamed_addr nounwind {
|
||||
; CHECK-LABEL: test_vld1_bf16_x2:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ld1 { v0.4h, v1.4h }, [x0]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%vld1xN = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x2.v4bf16.p0bf16(bfloat* %ptr)
|
||||
%vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld1xN, 0
|
||||
%vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld1xN, 1
|
||||
%.fca.0.0.insert = insertvalue %struct.bfloat16x4x2_t undef, <4 x bfloat> %vld1xN.fca.0.extract, 0, 0
|
||||
%.fca.0.1.insert = insertvalue %struct.bfloat16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vld1xN.fca.1.extract, 0, 1
|
||||
ret %struct.bfloat16x4x2_t %.fca.0.1.insert
|
||||
}
|
||||
|
||||
declare { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x2.v4bf16.p0bf16(bfloat*) nounwind
|
||||
|
||||
define %struct.bfloat16x8x2_t @test_vld1q_bf16_x2(bfloat* %ptr) local_unnamed_addr nounwind {
|
||||
; CHECK-LABEL: test_vld1q_bf16_x2:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ld1 { v0.8h, v1.8h }, [x0]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%vld1xN = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x2.v8bf16.p0bf16(bfloat* %ptr)
|
||||
%vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld1xN, 0
|
||||
%vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld1xN, 1
|
||||
%.fca.0.0.insert = insertvalue %struct.bfloat16x8x2_t undef, <8 x bfloat> %vld1xN.fca.0.extract, 0, 0
|
||||
%.fca.0.1.insert = insertvalue %struct.bfloat16x8x2_t %.fca.0.0.insert, <8 x bfloat> %vld1xN.fca.1.extract, 0, 1
|
||||
ret %struct.bfloat16x8x2_t %.fca.0.1.insert
|
||||
}
|
||||
|
||||
; Function Attrs: argmemonly nounwind readonly
|
||||
declare { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x2.v8bf16.p0bf16(bfloat*) nounwind
|
||||
|
||||
define %struct.bfloat16x4x3_t @test_vld1_bf16_x3(bfloat* %ptr) local_unnamed_addr nounwind {
|
||||
; CHECK-LABEL: test_vld1_bf16_x3:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ld1 { v0.4h, v1.4h, v2.4h }, [x0]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x3.v4bf16.p0bf16(bfloat* %ptr)
|
||||
%vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 0
|
||||
%vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 1
|
||||
%vld1xN.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 2
|
||||
%.fca.0.0.insert = insertvalue %struct.bfloat16x4x3_t undef, <4 x bfloat> %vld1xN.fca.0.extract, 0, 0
|
||||
%.fca.0.1.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.0.insert, <4 x bfloat> %vld1xN.fca.1.extract, 0, 1
|
||||
%.fca.0.2.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.1.insert, <4 x bfloat> %vld1xN.fca.2.extract, 0, 2
|
||||
ret %struct.bfloat16x4x3_t %.fca.0.2.insert
|
||||
}
|
||||
|
||||
; Function Attrs: argmemonly nounwind readonly
|
||||
declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x3.v4bf16.p0bf16(bfloat*) nounwind
|
||||
|
||||
define %struct.bfloat16x8x3_t @test_vld1q_bf16_x3(bfloat* %ptr) local_unnamed_addr nounwind {
|
||||
; CHECK-LABEL: test_vld1q_bf16_x3:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ld1 { v0.8h, v1.8h, v2.8h }, [x0]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x3.v8bf16.p0bf16(bfloat* %ptr)
|
||||
%vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 0
|
||||
%vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 1
|
||||
%vld1xN.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 2
|
||||
%.fca.0.0.insert = insertvalue %struct.bfloat16x8x3_t undef, <8 x bfloat> %vld1xN.fca.0.extract, 0, 0
|
||||
%.fca.0.1.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.0.insert, <8 x bfloat> %vld1xN.fca.1.extract, 0, 1
|
||||
%.fca.0.2.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.1.insert, <8 x bfloat> %vld1xN.fca.2.extract, 0, 2
|
||||
ret %struct.bfloat16x8x3_t %.fca.0.2.insert
|
||||
}
|
||||
|
||||
; Function Attrs: argmemonly nounwind readonly
|
||||
declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x3.v8bf16.p0bf16(bfloat*) nounwind
|
||||
|
||||
define %struct.bfloat16x4x4_t @test_vld1_bf16_x4(bfloat* %ptr) local_unnamed_addr nounwind {
|
||||
; CHECK-LABEL: test_vld1_bf16_x4:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ld1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x4.v4bf16.p0bf16(bfloat* %ptr)
|
||||
%vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 0
|
||||
%vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 1
|
||||
%vld1xN.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 2
|
||||
%vld1xN.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 3
|
||||
%.fca.0.0.insert = insertvalue %struct.bfloat16x4x4_t undef, <4 x bfloat> %vld1xN.fca.0.extract, 0, 0
|
||||
%.fca.0.1.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.0.insert, <4 x bfloat> %vld1xN.fca.1.extract, 0, 1
|
||||
%.fca.0.2.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.1.insert, <4 x bfloat> %vld1xN.fca.2.extract, 0, 2
|
||||
%.fca.0.3.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.2.insert, <4 x bfloat> %vld1xN.fca.3.extract, 0, 3
|
||||
ret %struct.bfloat16x4x4_t %.fca.0.3.insert
|
||||
}
|
||||
|
||||
; Function Attrs: argmemonly nounwind readonly
|
||||
declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x4.v4bf16.p0bf16(bfloat*) nounwind
|
||||
|
||||
define %struct.bfloat16x8x4_t @test_vld1q_bf16_x4(bfloat* %ptr) local_unnamed_addr nounwind {
|
||||
; CHECK-LABEL: test_vld1q_bf16_x4:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x4.v8bf16.p0bf16(bfloat* %ptr)
|
||||
%vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 0
|
||||
%vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 1
|
||||
%vld1xN.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 2
|
||||
%vld1xN.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 3
|
||||
%.fca.0.0.insert = insertvalue %struct.bfloat16x8x4_t undef, <8 x bfloat> %vld1xN.fca.0.extract, 0, 0
|
||||
%.fca.0.1.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.0.insert, <8 x bfloat> %vld1xN.fca.1.extract, 0, 1
|
||||
%.fca.0.2.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.1.insert, <8 x bfloat> %vld1xN.fca.2.extract, 0, 2
|
||||
%.fca.0.3.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.2.insert, <8 x bfloat> %vld1xN.fca.3.extract, 0, 3
|
||||
ret %struct.bfloat16x8x4_t %.fca.0.3.insert
|
||||
}
|
||||
|
||||
; Function Attrs: argmemonly nounwind readonly
|
||||
declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x4.v8bf16.p0bf16(bfloat*) nounwind
|
||||
|
||||
define <8 x bfloat> @test_vld1q_dup_bf16(bfloat* nocapture readonly %ptr) local_unnamed_addr nounwind {
|
||||
; CHECK-LABEL: test_vld1q_dup_bf16:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ld1r { v0.8h }, [x0]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%0 = load bfloat, bfloat* %ptr, align 2
|
||||
%1 = insertelement <8 x bfloat> undef, bfloat %0, i32 0
|
||||
%lane = shufflevector <8 x bfloat> %1, <8 x bfloat> undef, <8 x i32> zeroinitializer
|
||||
ret <8 x bfloat> %lane
|
||||
}
|
||||
|
||||
define %struct.bfloat16x4x2_t @test_vld2_bf16(bfloat* %ptr) local_unnamed_addr nounwind {
|
||||
; CHECK-LABEL: test_vld2_bf16:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ld2 { v0.4h, v1.4h }, [x0]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%0 = bitcast bfloat* %ptr to <4 x bfloat>*
|
||||
%vld2 = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2.v4bf16.p0v4bf16(<4 x bfloat>* %0)
|
||||
%vld2.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2, 0
|
||||
%vld2.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2, 1
|
||||
%.fca.0.0.insert = insertvalue %struct.bfloat16x4x2_t undef, <4 x bfloat> %vld2.fca.0.extract, 0, 0
|
||||
%.fca.0.1.insert = insertvalue %struct.bfloat16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vld2.fca.1.extract, 0, 1
|
||||
ret %struct.bfloat16x4x2_t %.fca.0.1.insert
|
||||
}
|
||||
|
||||
; Function Attrs: argmemonly nounwind readonly
|
||||
declare { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2.v4bf16.p0v4bf16(<4 x bfloat>*) nounwind
|
||||
|
||||
define %struct.bfloat16x8x2_t @test_vld2q_bf16(bfloat* %ptr) local_unnamed_addr nounwind {
|
||||
; CHECK-LABEL: test_vld2q_bf16:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ld2 { v0.8h, v1.8h }, [x0]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%0 = bitcast bfloat* %ptr to <8 x bfloat>*
|
||||
%vld2 = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2.v8bf16.p0v8bf16(<8 x bfloat>* %0)
|
||||
%vld2.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2, 0
|
||||
%vld2.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2, 1
|
||||
%.fca.0.0.insert = insertvalue %struct.bfloat16x8x2_t undef, <8 x bfloat> %vld2.fca.0.extract, 0, 0
|
||||
%.fca.0.1.insert = insertvalue %struct.bfloat16x8x2_t %.fca.0.0.insert, <8 x bfloat> %vld2.fca.1.extract, 0, 1
|
||||
ret %struct.bfloat16x8x2_t %.fca.0.1.insert
|
||||
}
|
||||
|
||||
; Function Attrs: argmemonly nounwind readonly
|
||||
declare { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2.v8bf16.p0v8bf16(<8 x bfloat>*) nounwind
|
||||
define %struct.bfloat16x4x2_t @test_vld2_lane_bf16(bfloat* %ptr, [2 x <4 x bfloat>] %src.coerce) local_unnamed_addr nounwind {
|
||||
; CHECK-LABEL: test_vld2_lane_bf16:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK: ld2 { v0.h, v1.h }[1], [x0]
|
||||
; CHECK: ret
|
||||
entry:
|
||||
%src.coerce.fca.0.extract = extractvalue [2 x <4 x bfloat>] %src.coerce, 0
|
||||
%src.coerce.fca.1.extract = extractvalue [2 x <4 x bfloat>] %src.coerce, 1
|
||||
%0 = bitcast bfloat* %ptr to i8*
|
||||
%vld2_lane = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, i64 1, i8* %0)
|
||||
%vld2_lane.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_lane, 0
|
||||
%vld2_lane.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_lane, 1
|
||||
%.fca.0.0.insert = insertvalue %struct.bfloat16x4x2_t undef, <4 x bfloat> %vld2_lane.fca.0.extract, 0, 0
|
||||
%.fca.0.1.insert = insertvalue %struct.bfloat16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vld2_lane.fca.1.extract, 0, 1
|
||||
ret %struct.bfloat16x4x2_t %.fca.0.1.insert
|
||||
}
|
||||
|
||||
; Function Attrs: argmemonly nounwind readonly
|
||||
declare { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2lane.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, i64, i8*) nounwind
|
||||
|
||||
define %struct.bfloat16x8x2_t @test_vld2q_lane_bf16(bfloat* %ptr, [2 x <8 x bfloat>] %src.coerce) local_unnamed_addr nounwind {
|
||||
; CHECK-LABEL: test_vld2q_lane_bf16:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK: ld2 { v0.h, v1.h }[7], [x0]
|
||||
; CHECK: ret
|
||||
entry:
|
||||
%src.coerce.fca.0.extract = extractvalue [2 x <8 x bfloat>] %src.coerce, 0
|
||||
%src.coerce.fca.1.extract = extractvalue [2 x <8 x bfloat>] %src.coerce, 1
|
||||
%0 = bitcast bfloat* %ptr to i8*
|
||||
%vld2_lane = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, i64 7, i8* %0)
|
||||
%vld2_lane.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2_lane, 0
|
||||
%vld2_lane.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2_lane, 1
|
||||
%.fca.0.0.insert = insertvalue %struct.bfloat16x8x2_t undef, <8 x bfloat> %vld2_lane.fca.0.extract, 0, 0
|
||||
%.fca.0.1.insert = insertvalue %struct.bfloat16x8x2_t %.fca.0.0.insert, <8 x bfloat> %vld2_lane.fca.1.extract, 0, 1
|
||||
ret %struct.bfloat16x8x2_t %.fca.0.1.insert
|
||||
}
|
||||
|
||||
; Function Attrs: argmemonly nounwind readonly
|
||||
declare { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2lane.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, i64, i8*) nounwind
|
||||
|
||||
define %struct.bfloat16x4x3_t @test_vld3_bf16(bfloat* %ptr) local_unnamed_addr nounwind {
|
||||
; CHECK-LABEL: test_vld3_bf16:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ld3 { v0.4h, v1.4h, v2.4h }, [x0]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%0 = bitcast bfloat* %ptr to <4 x bfloat>*
|
||||
%vld3 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3.v4bf16.p0v4bf16(<4 x bfloat>* %0)
|
||||
%vld3.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3, 0
|
||||
%vld3.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3, 1
|
||||
%vld3.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3, 2
|
||||
%.fca.0.0.insert = insertvalue %struct.bfloat16x4x3_t undef, <4 x bfloat> %vld3.fca.0.extract, 0, 0
|
||||
%.fca.0.1.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.0.insert, <4 x bfloat> %vld3.fca.1.extract, 0, 1
|
||||
%.fca.0.2.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.1.insert, <4 x bfloat> %vld3.fca.2.extract, 0, 2
|
||||
ret %struct.bfloat16x4x3_t %.fca.0.2.insert
|
||||
}
|
||||
|
||||
; Function Attrs: argmemonly nounwind readonly
|
||||
declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3.v4bf16.p0v4bf16(<4 x bfloat>*) nounwind
|
||||
|
||||
define %struct.bfloat16x8x3_t @test_vld3q_bf16(bfloat* %ptr) local_unnamed_addr nounwind {
|
||||
; CHECK-LABEL: test_vld3q_bf16:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ld3 { v0.8h, v1.8h, v2.8h }, [x0]
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%0 = bitcast bfloat* %ptr to <8 x bfloat>*
|
||||
%vld3 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3.v8bf16.p0v8bf16(<8 x bfloat>* %0)
|
||||
%vld3.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3, 0
|
||||
%vld3.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3, 1
|
||||
%vld3.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3, 2
|
||||
%.fca.0.0.insert = insertvalue %struct.bfloat16x8x3_t undef, <8 x bfloat> %vld3.fca.0.extract, 0, 0
|
||||
%.fca.0.1.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.0.insert, <8 x bfloat> %vld3.fca.1.extract, 0, 1
|
||||
%.fca.0.2.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.1.insert, <8 x bfloat> %vld3.fca.2.extract, 0, 2
|
||||
ret %struct.bfloat16x8x3_t %.fca.0.2.insert
|
||||
}
|
||||
|
||||
; Function Attrs: argmemonly nounwind readonly
|
||||
declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3.v8bf16.p0v8bf16(<8 x bfloat>*) nounwind
|
||||
|
||||
define %struct.bfloat16x4x3_t @test_vld3_lane_bf16(bfloat* %ptr, [3 x <4 x bfloat>] %src.coerce) local_unnamed_addr nounwind {
|
||||
; CHECK-LABEL: test_vld3_lane_bf16:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECK: ld3 { v0.h, v1.h, v2.h }[1], [x0]
|
||||
; CHECK: ret
|
||||
entry:
|
||||
%src.coerce.fca.0.extract = extractvalue [3 x <4 x bfloat>] %src.coerce, 0
|
||||
%src.coerce.fca.1.extract = extractvalue [3 x <4 x bfloat>] %src.coerce, 1
|
||||
%src.coerce.fca.2.extract = extractvalue [3 x <4 x bfloat>] %src.coerce, 2
|
||||
%0 = bitcast bfloat* %ptr to i8*
|
||||
%vld3_lane = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, <4 x bfloat> %src.coerce.fca.2.extract, i64 1, i8* %0)
|
||||
%vld3_lane.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane, 0
|
||||
%vld3_lane.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane, 1
|
||||
%vld3_lane.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane, 2
|
||||
%.fca.0.0.insert = insertvalue %struct.bfloat16x4x3_t undef, <4 x bfloat> %vld3_lane.fca.0.extract, 0, 0
|
||||
%.fca.0.1.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.0.insert, <4 x bfloat> %vld3_lane.fca.1.extract, 0, 1
|
||||
%.fca.0.2.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.1.insert, <4 x bfloat> %vld3_lane.fca.2.extract, 0, 2
|
||||
ret %struct.bfloat16x4x3_t %.fca.0.2.insert
|
||||
}
|
||||
|
||||
; Function Attrs: argmemonly nounwind readonly
|
||||
declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3lane.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i64, i8*) nounwind
|
||||
|
||||
define %struct.bfloat16x8x3_t @test_vld3q_lane_bf16(bfloat* %ptr, [3 x <8 x bfloat>] %src.coerce) local_unnamed_addr nounwind {
|
||||
; CHECK-LABEL: test_vld3q_lane_bf16:
|
||||
; CHECK: // %bb.0: // %entry
|
||||
; CHECKT: ld3 { v0.h, v1.h, v2.h }[7], [x0]
|
||||
; CHECKT: ret
|
||||
entry:
|
||||
%src.coerce.fca.0.extract = extractvalue [3 x <8 x bfloat>] %src.coerce, 0
|
||||
%src.coerce.fca.1.extract = extractvalue [3 x <8 x bfloat>] %src.coerce, 1
|
||||
%src.coerce.fca.2.extract = extractvalue [3 x <8 x bfloat>] %src.coerce, 2
|
||||
%0 = bitcast bfloat* %ptr to i8*
|
||||
%vld3_lane = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, <8 x bfloat> %src.coerce.fca.2.extract, i64 7, i8* %0)
|
||||
%vld3_lane.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3_lane, 0
|
||||
%vld3_lane.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3_lane, 1
|
||||
%vld3_lane.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3_lane, 2
|
||||
%.fca.0.0.insert = insertvalue %struct.bfloat16x8x3_t undef, <8 x bfloat> %vld3_lane.fca.0.extract, 0, 0
|
||||
%.fca.0.1.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.0.insert, <8 x bfloat> %vld3_lane.fca.1.extract, 0, 1
|
||||
%.fca.0.2.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.1.insert, <8 x bfloat> %vld3_lane.fca.2.extract, 0, 2
|
||||
ret %struct.bfloat16x8x3_t %.fca.0.2.insert
|
||||
}
|
||||
|
||||
; Function Attrs: argmemonly nounwind readonly
declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3lane.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i64, i8*) nounwind

define %struct.bfloat16x4x4_t @test_vld4_bf16(bfloat* %ptr) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld4_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
; CHECK-NEXT: ret
entry:
  %0 = bitcast bfloat* %ptr to <4 x bfloat>*
  %vld4 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4.v4bf16.p0v4bf16(<4 x bfloat>* %0)
  %vld4.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 0
  %vld4.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 1
  %vld4.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 2
  %vld4.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 3
  %.fca.0.0.insert = insertvalue %struct.bfloat16x4x4_t undef, <4 x bfloat> %vld4.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.0.insert, <4 x bfloat> %vld4.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.1.insert, <4 x bfloat> %vld4.fca.2.extract, 0, 2
  %.fca.0.3.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.2.insert, <4 x bfloat> %vld4.fca.3.extract, 0, 3
  ret %struct.bfloat16x4x4_t %.fca.0.3.insert
}

; Function Attrs: argmemonly nounwind readonly
declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4.v4bf16.p0v4bf16(<4 x bfloat>*) nounwind

define %struct.bfloat16x8x4_t @test_vld4q_bf16(bfloat* %ptr) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld4q_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
; CHECK-NEXT: ret
entry:
  %0 = bitcast bfloat* %ptr to <8 x bfloat>*
  %vld4 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4.v8bf16.p0v8bf16(<8 x bfloat>* %0)
  %vld4.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 0
  %vld4.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 1
  %vld4.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 2
  %vld4.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 3
  %.fca.0.0.insert = insertvalue %struct.bfloat16x8x4_t undef, <8 x bfloat> %vld4.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.0.insert, <8 x bfloat> %vld4.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.1.insert, <8 x bfloat> %vld4.fca.2.extract, 0, 2
  %.fca.0.3.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.2.insert, <8 x bfloat> %vld4.fca.3.extract, 0, 3
  ret %struct.bfloat16x8x4_t %.fca.0.3.insert
}

; Function Attrs: argmemonly nounwind readonly
declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4.v8bf16.p0v8bf16(<8 x bfloat>*) nounwind

define %struct.bfloat16x4x4_t @test_vld4_lane_bf16(bfloat* %ptr, [4 x <4 x bfloat>] %src.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld4_lane_bf16:
; CHECK: // %bb.0: // %entry
; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[1], [x0]
; CHECK: ret
entry:
  %src.coerce.fca.0.extract = extractvalue [4 x <4 x bfloat>] %src.coerce, 0
  %src.coerce.fca.1.extract = extractvalue [4 x <4 x bfloat>] %src.coerce, 1
  %src.coerce.fca.2.extract = extractvalue [4 x <4 x bfloat>] %src.coerce, 2
  %src.coerce.fca.3.extract = extractvalue [4 x <4 x bfloat>] %src.coerce, 3
  %0 = bitcast bfloat* %ptr to i8*
  %vld4_lane = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, <4 x bfloat> %src.coerce.fca.2.extract, <4 x bfloat> %src.coerce.fca.3.extract, i64 1, i8* %0)
  %vld4_lane.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane, 0
  %vld4_lane.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane, 1
  %vld4_lane.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane, 2
  %vld4_lane.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane, 3
  %.fca.0.0.insert = insertvalue %struct.bfloat16x4x4_t undef, <4 x bfloat> %vld4_lane.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.0.insert, <4 x bfloat> %vld4_lane.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.1.insert, <4 x bfloat> %vld4_lane.fca.2.extract, 0, 2
  %.fca.0.3.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.2.insert, <4 x bfloat> %vld4_lane.fca.3.extract, 0, 3
  ret %struct.bfloat16x4x4_t %.fca.0.3.insert
}

; Function Attrs: argmemonly nounwind readonly
declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4lane.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i64, i8*) nounwind

define %struct.bfloat16x8x4_t @test_vld4q_lane_bf16(bfloat* %ptr, [4 x <8 x bfloat>] %src.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld4q_lane_bf16:
; CHECK: // %bb.0: // %entry
; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[7], [x0]
; CHECK: ret
entry:
  %src.coerce.fca.0.extract = extractvalue [4 x <8 x bfloat>] %src.coerce, 0
  %src.coerce.fca.1.extract = extractvalue [4 x <8 x bfloat>] %src.coerce, 1
  %src.coerce.fca.2.extract = extractvalue [4 x <8 x bfloat>] %src.coerce, 2
  %src.coerce.fca.3.extract = extractvalue [4 x <8 x bfloat>] %src.coerce, 3
  %0 = bitcast bfloat* %ptr to i8*
  %vld4_lane = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, <8 x bfloat> %src.coerce.fca.2.extract, <8 x bfloat> %src.coerce.fca.3.extract, i64 7, i8* %0)
  %vld4_lane.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4_lane, 0
  %vld4_lane.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4_lane, 1
  %vld4_lane.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4_lane, 2
  %vld4_lane.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4_lane, 3
  %.fca.0.0.insert = insertvalue %struct.bfloat16x8x4_t undef, <8 x bfloat> %vld4_lane.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.0.insert, <8 x bfloat> %vld4_lane.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.1.insert, <8 x bfloat> %vld4_lane.fca.2.extract, 0, 2
  %.fca.0.3.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.2.insert, <8 x bfloat> %vld4_lane.fca.3.extract, 0, 3
  ret %struct.bfloat16x8x4_t %.fca.0.3.insert
}

; Function Attrs: argmemonly nounwind readonly
declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4lane.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i64, i8*) nounwind

define %struct.bfloat16x4x2_t @test_vld2_dup_bf16(bfloat* %ptr) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld2_dup_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ld2r { v0.4h, v1.4h }, [x0]
; CHECK-NEXT: ret
entry:
  %vld2 = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2r.v4bf16.p0bf16(bfloat* %ptr)
  %vld2.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2, 1
  %.fca.0.0.insert = insertvalue %struct.bfloat16x4x2_t undef, <4 x bfloat> %vld2.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.bfloat16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vld2.fca.1.extract, 0, 1
  ret %struct.bfloat16x4x2_t %.fca.0.1.insert
}

; Function Attrs: argmemonly nounwind readonly
declare { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2r.v4bf16.p0bf16(bfloat*) nounwind

define %struct.bfloat16x8x2_t @test_vld2q_dup_bf16(bfloat* %ptr) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld2q_dup_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ld2r { v0.8h, v1.8h }, [x0]
; CHECK-NEXT: ret
entry:
  %vld2 = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2r.v8bf16.p0bf16(bfloat* %ptr)
  %vld2.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2, 1
  %.fca.0.0.insert = insertvalue %struct.bfloat16x8x2_t undef, <8 x bfloat> %vld2.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.bfloat16x8x2_t %.fca.0.0.insert, <8 x bfloat> %vld2.fca.1.extract, 0, 1
  ret %struct.bfloat16x8x2_t %.fca.0.1.insert
}

; Function Attrs: argmemonly nounwind readonly
declare { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2r.v8bf16.p0bf16(bfloat*) nounwind

define %struct.bfloat16x4x3_t @test_vld3_dup_bf16(bfloat* %ptr) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld3_dup_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ld3r { v0.4h, v1.4h, v2.4h }, [x0]
; CHECK-NEXT: ret
entry:
  %vld3 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3r.v4bf16.p0bf16(bfloat* %ptr)
  %vld3.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3, 0
  %vld3.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3, 1
  %vld3.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3, 2
  %.fca.0.0.insert = insertvalue %struct.bfloat16x4x3_t undef, <4 x bfloat> %vld3.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.0.insert, <4 x bfloat> %vld3.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.1.insert, <4 x bfloat> %vld3.fca.2.extract, 0, 2
  ret %struct.bfloat16x4x3_t %.fca.0.2.insert
}

; Function Attrs: argmemonly nounwind readonly
declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3r.v4bf16.p0bf16(bfloat*) nounwind

define %struct.bfloat16x8x3_t @test_vld3q_dup_bf16(bfloat* %ptr) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld3q_dup_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ld3r { v0.8h, v1.8h, v2.8h }, [x0]
; CHECK-NEXT: ret
entry:
  %vld3 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3r.v8bf16.p0bf16(bfloat* %ptr)
  %vld3.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3, 0
  %vld3.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3, 1
  %vld3.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3, 2
  %.fca.0.0.insert = insertvalue %struct.bfloat16x8x3_t undef, <8 x bfloat> %vld3.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.0.insert, <8 x bfloat> %vld3.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.1.insert, <8 x bfloat> %vld3.fca.2.extract, 0, 2
  ret %struct.bfloat16x8x3_t %.fca.0.2.insert
}

; Function Attrs: argmemonly nounwind readonly
declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3r.v8bf16.p0bf16(bfloat*) nounwind

define %struct.bfloat16x4x4_t @test_vld4_dup_bf16(bfloat* %ptr) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld4_dup_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ld4r { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
; CHECK-NEXT: ret
entry:
  %vld4 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4r.v4bf16.p0bf16(bfloat* %ptr)
  %vld4.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 0
  %vld4.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 1
  %vld4.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 2
  %vld4.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 3
  %.fca.0.0.insert = insertvalue %struct.bfloat16x4x4_t undef, <4 x bfloat> %vld4.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.0.insert, <4 x bfloat> %vld4.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.1.insert, <4 x bfloat> %vld4.fca.2.extract, 0, 2
  %.fca.0.3.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.2.insert, <4 x bfloat> %vld4.fca.3.extract, 0, 3
  ret %struct.bfloat16x4x4_t %.fca.0.3.insert
}

; Function Attrs: argmemonly nounwind readonly
declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4r.v4bf16.p0bf16(bfloat*) nounwind

define %struct.bfloat16x8x4_t @test_vld4q_dup_bf16(bfloat* %ptr) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld4q_dup_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ld4r { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
; CHECK-NEXT: ret
entry:
  %vld4 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4r.v8bf16.p0bf16(bfloat* %ptr)
  %vld4.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 0
  %vld4.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 1
  %vld4.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 2
  %vld4.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 3
  %.fca.0.0.insert = insertvalue %struct.bfloat16x8x4_t undef, <8 x bfloat> %vld4.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.0.insert, <8 x bfloat> %vld4.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.1.insert, <8 x bfloat> %vld4.fca.2.extract, 0, 2
  %.fca.0.3.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.2.insert, <8 x bfloat> %vld4.fca.3.extract, 0, 3
  ret %struct.bfloat16x8x4_t %.fca.0.3.insert
}

; Function Attrs: argmemonly nounwind readonly
declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4r.v8bf16.p0bf16(bfloat*) nounwind

define void @test_vst1_bf16(bfloat* nocapture %ptr, <4 x bfloat> %val) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst1_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str d0, [x0]
; CHECK-NEXT: ret
entry:
  %0 = bitcast bfloat* %ptr to <4 x bfloat>*
  store <4 x bfloat> %val, <4 x bfloat>* %0, align 8
  ret void
}

define void @test_vst1q_bf16(bfloat* nocapture %ptr, <8 x bfloat> %val) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst1q_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: ret
entry:
  %0 = bitcast bfloat* %ptr to <8 x bfloat>*
  store <8 x bfloat> %val, <8 x bfloat>* %0, align 16
  ret void
}

define void @test_vst1_lane_bf16(bfloat* nocapture %ptr, <4 x bfloat> %val) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst1_lane_bf16:
; CHECK: // %bb.0: // %entry
; CHECK: st1 { v0.h }[1], [x0]
; CHECK: ret
entry:
  %0 = extractelement <4 x bfloat> %val, i32 1
  store bfloat %0, bfloat* %ptr, align 2
  ret void
}

define void @test_vst1q_lane_bf16(bfloat* nocapture %ptr, <8 x bfloat> %val) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst1q_lane_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: st1 { v0.h }[7], [x0]
; CHECK-NEXT: ret
entry:
  %0 = extractelement <8 x bfloat> %val, i32 7
  store bfloat %0, bfloat* %ptr, align 2
  ret void
}

define void @test_vst1_bf16_x2(bfloat* nocapture %ptr, [2 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst1_bf16_x2:
; CHECK: // %bb.0: // %entry
; CHECK: st1 { v0.4h, v1.4h }, [x0]
; CHECK: ret
entry:
  %val.coerce.fca.0.extract = extractvalue [2 x <4 x bfloat>] %val.coerce, 0
  %val.coerce.fca.1.extract = extractvalue [2 x <4 x bfloat>] %val.coerce, 1
  tail call void @llvm.aarch64.neon.st1x2.v4bf16.p0bf16(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, bfloat* %ptr)
  ret void
}

; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st1x2.v4bf16.p0bf16(<4 x bfloat>, <4 x bfloat>, bfloat* nocapture) nounwind

define void @test_vst1q_bf16_x2(bfloat* nocapture %ptr, [2 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst1q_bf16_x2:
; CHECK: // %bb.0: // %entry
; CHECK: st1 { v0.8h, v1.8h }, [x0]
; CHECK: ret
entry:
  %val.coerce.fca.0.extract = extractvalue [2 x <8 x bfloat>] %val.coerce, 0
  %val.coerce.fca.1.extract = extractvalue [2 x <8 x bfloat>] %val.coerce, 1
  tail call void @llvm.aarch64.neon.st1x2.v8bf16.p0bf16(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, bfloat* %ptr)
  ret void
}

; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st1x2.v8bf16.p0bf16(<8 x bfloat>, <8 x bfloat>, bfloat* nocapture) nounwind

define void @test_vst1_bf16_x3(bfloat* nocapture %ptr, [3 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst1_bf16_x3:
; CHECK: // %bb.0: // %entry
; CHECK: st1 { v0.4h, v1.4h, v2.4h }, [x0]
; CHECK: ret
entry:
  %val.coerce.fca.0.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 0
  %val.coerce.fca.1.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 1
  %val.coerce.fca.2.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 2
  tail call void @llvm.aarch64.neon.st1x3.v4bf16.p0bf16(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, bfloat* %ptr)
  ret void
}

; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st1x3.v4bf16.p0bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, bfloat* nocapture) nounwind

define void @test_vst1q_bf16_x3(bfloat* nocapture %ptr, [3 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst1q_bf16_x3:
; CHECK: // %bb.0: // %entry
; CHECK: st1 { v0.8h, v1.8h, v2.8h }, [x0]
; CHECK: ret
entry:
  %val.coerce.fca.0.extract = extractvalue [3 x <8 x bfloat>] %val.coerce, 0
  %val.coerce.fca.1.extract = extractvalue [3 x <8 x bfloat>] %val.coerce, 1
  %val.coerce.fca.2.extract = extractvalue [3 x <8 x bfloat>] %val.coerce, 2
  tail call void @llvm.aarch64.neon.st1x3.v8bf16.p0bf16(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, bfloat* %ptr)
  ret void
}

; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st1x3.v8bf16.p0bf16(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, bfloat* nocapture) nounwind

; Function Attrs: nounwind
define void @test_vst1_bf16_x4(bfloat* nocapture %ptr, [4 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst1_bf16_x4:
; CHECK: // %bb.0: // %entry
; CHECK: st1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
; CHECK: ret
entry:
  %val.coerce.fca.0.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 0
  %val.coerce.fca.1.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 1
  %val.coerce.fca.2.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 2
  %val.coerce.fca.3.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 3
  tail call void @llvm.aarch64.neon.st1x4.v4bf16.p0bf16(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, <4 x bfloat> %val.coerce.fca.3.extract, bfloat* %ptr)
  ret void
}

; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st1x4.v4bf16.p0bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, bfloat* nocapture) nounwind

define void @test_vst1q_bf16_x4(bfloat* nocapture %ptr, [4 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst1q_bf16_x4:
; CHECK: // %bb.0: // %entry
; CHECK: st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
; CHECK: ret
entry:
  %val.coerce.fca.0.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 0
  %val.coerce.fca.1.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 1
  %val.coerce.fca.2.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 2
  %val.coerce.fca.3.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 3
  tail call void @llvm.aarch64.neon.st1x4.v8bf16.p0bf16(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, <8 x bfloat> %val.coerce.fca.3.extract, bfloat* %ptr)
  ret void
}

; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st1x4.v8bf16.p0bf16(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, bfloat* nocapture) nounwind

define void @test_vst2_bf16(bfloat* nocapture %ptr, [2 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst2_bf16:
; CHECK: // %bb.0: // %entry
; CHECK: st2 { v0.4h, v1.4h }, [x0]
; CHECK: ret
entry:
  %val.coerce.fca.0.extract = extractvalue [2 x <4 x bfloat>] %val.coerce, 0
  %val.coerce.fca.1.extract = extractvalue [2 x <4 x bfloat>] %val.coerce, 1
  %0 = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st2.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, i8* %0)
  ret void
}

; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st2.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, i8* nocapture) nounwind

define void @test_vst2q_bf16(bfloat* nocapture %ptr, [2 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst2q_bf16:
; CHECK: // %bb.0: // %entry
; CHECK: st2 { v0.8h, v1.8h }, [x0]
; CHECK: ret
entry:
  %val.coerce.fca.0.extract = extractvalue [2 x <8 x bfloat>] %val.coerce, 0
  %val.coerce.fca.1.extract = extractvalue [2 x <8 x bfloat>] %val.coerce, 1
  %0 = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st2.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, i8* %0)
  ret void
}

; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st2.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, i8* nocapture) nounwind

define void @test_vst2_lane_bf16(bfloat* nocapture %ptr, [2 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst2_lane_bf16:
; CHECK: // %bb.0: // %entry
; CHECK: st2 { v0.h, v1.h }[1], [x0]
; CHECK: ret
entry:
  %val.coerce.fca.0.extract = extractvalue [2 x <4 x bfloat>] %val.coerce, 0
  %val.coerce.fca.1.extract = extractvalue [2 x <4 x bfloat>] %val.coerce, 1
  %0 = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st2lane.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, i64 1, i8* %0)
  ret void
}

; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st2lane.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, i64, i8* nocapture) nounwind

; Function Attrs: nounwind
define void @test_vst2q_lane_bf16(bfloat* nocapture %ptr, [2 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst2q_lane_bf16:
; CHECK: // %bb.0: // %entry
; CHECK: st2 { v0.h, v1.h }[7], [x0]
; CHECK: ret
entry:
  %val.coerce.fca.0.extract = extractvalue [2 x <8 x bfloat>] %val.coerce, 0
  %val.coerce.fca.1.extract = extractvalue [2 x <8 x bfloat>] %val.coerce, 1
  %0 = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st2lane.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, i64 7, i8* %0)
  ret void
}

; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st2lane.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, i64, i8* nocapture) nounwind

; Function Attrs: nounwind
define void @test_vst3_bf16(bfloat* nocapture %ptr, [3 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst3_bf16:
; CHECK: // %bb.0: // %entry
; CHECK: st3 { v0.4h, v1.4h, v2.4h }, [x0]
; CHECK: ret
entry:
  %val.coerce.fca.0.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 0
  %val.coerce.fca.1.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 1
  %val.coerce.fca.2.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 2
  %0 = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st3.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, i8* %0)
  ret void
}

; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st3.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i8* nocapture) nounwind

; Function Attrs: nounwind
define void @test_vst3q_bf16(bfloat* nocapture %ptr, [3 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst3q_bf16:
; CHECK: // %bb.0: // %entry
; CHECK: st3 { v0.8h, v1.8h, v2.8h }, [x0]
; CHECK: ret
entry:
  %val.coerce.fca.0.extract = extractvalue [3 x <8 x bfloat>] %val.coerce, 0
  %val.coerce.fca.1.extract = extractvalue [3 x <8 x bfloat>] %val.coerce, 1
  %val.coerce.fca.2.extract = extractvalue [3 x <8 x bfloat>] %val.coerce, 2
  %0 = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st3.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, i8* %0)
  ret void
}

; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st3.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i8* nocapture) nounwind

; Function Attrs: nounwind
define void @test_vst3_lane_bf16(bfloat* nocapture %ptr, [3 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst3_lane_bf16:
; CHECK: // %bb.0: // %entry
; CHECK: st3 { v0.h, v1.h, v2.h }[1], [x0]
; CHECK: ret
entry:
  %val.coerce.fca.0.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 0
  %val.coerce.fca.1.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 1
  %val.coerce.fca.2.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 2
  %0 = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st3lane.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, i64 1, i8* %0)
  ret void
}

; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st3lane.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i64, i8* nocapture) nounwind

; Function Attrs: nounwind
define void @test_vst3q_lane_bf16(bfloat* nocapture %ptr, [3 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst3q_lane_bf16:
; CHECK: // %bb.0: // %entry
; CHECK: st3 { v0.h, v1.h, v2.h }[7], [x0]
; CHECK: ret
entry:
  %val.coerce.fca.0.extract = extractvalue [3 x <8 x bfloat>] %val.coerce, 0
  %val.coerce.fca.1.extract = extractvalue [3 x <8 x bfloat>] %val.coerce, 1
  %val.coerce.fca.2.extract = extractvalue [3 x <8 x bfloat>] %val.coerce, 2
  %0 = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st3lane.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, i64 7, i8* %0)
  ret void
}

; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st3lane.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i64, i8* nocapture) nounwind

; Function Attrs: nounwind
define void @test_vst4_bf16(bfloat* nocapture %ptr, [4 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst4_bf16:
; CHECK: // %bb.0: // %entry
; CHECK: st4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
; CHECK: ret
entry:
  %val.coerce.fca.0.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 0
  %val.coerce.fca.1.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 1
  %val.coerce.fca.2.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 2
  %val.coerce.fca.3.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 3
  %0 = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st4.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, <4 x bfloat> %val.coerce.fca.3.extract, i8* %0)
  ret void
}

; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st4.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i8* nocapture) nounwind

; Function Attrs: nounwind
define void @test_vst4q_bf16(bfloat* nocapture %ptr, [4 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst4q_bf16:
; CHECK: // %bb.0: // %entry
; CHECK: st4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
; CHECK: ret
entry:
  %val.coerce.fca.0.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 0
  %val.coerce.fca.1.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 1
  %val.coerce.fca.2.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 2
  %val.coerce.fca.3.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 3
  %0 = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st4.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, <8 x bfloat> %val.coerce.fca.3.extract, i8* %0)
  ret void
}

; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st4.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i8* nocapture) nounwind
; vst4_lane_bf16: store lane 1 from each of four <4 x bfloat> registers;
; should select a single st4 lane-indexed store ([1]).
; Function Attrs: nounwind
define void @test_vst4_lane_bf16(bfloat* nocapture %ptr, [4 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst4_lane_bf16:
; CHECK: // %bb.0: // %entry
; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[1], [x0]
; CHECK: ret
entry:
  %val.coerce.fca.0.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 0
  %val.coerce.fca.1.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 1
  %val.coerce.fca.2.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 2
  %val.coerce.fca.3.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 3
  %0 = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st4lane.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, <4 x bfloat> %val.coerce.fca.3.extract, i64 1, i8* %0)
  ret void
}
; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st4lane.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i64, i8* nocapture) nounwind
; vst4q_lane_bf16: 128-bit variant — store lane 7 from each of four
; <8 x bfloat> registers; should select a single st4 lane-indexed store ([7]).
; Function Attrs: nounwind
define void @test_vst4q_lane_bf16(bfloat* nocapture %ptr, [4 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst4q_lane_bf16:
; CHECK: // %bb.0: // %entry
; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[7], [x0]
; CHECK: ret
entry:
  %val.coerce.fca.0.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 0
  %val.coerce.fca.1.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 1
  %val.coerce.fca.2.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 2
  %val.coerce.fca.3.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 3
  %0 = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st4lane.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, <8 x bfloat> %val.coerce.fca.3.extract, i64 7, i8* %0)
  ret void
}
; Function Attrs: argmemonly nounwind
declare void @llvm.aarch64.neon.st4lane.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i64, i8* nocapture) nounwind