forked from OSchip/llvm-project
[ARM] Add FP_ROUND handling to splitting MVE stores
This splits MVE vector stores of an fp_trunc in the same way that we do for standard truncs. It extends PerformSplittingToNarrowingStores to handle fp_round, splitting the store into pieces and adding a VCVTNb to perform the actual fp_round. The actual store is then converted to an integer store so that it can truncate the bottom lanes of the result. Differential Revision: https://reviews.llvm.org/D81141
This commit is contained in:
parent
c25acec845
commit
0bfb4c2506
|
@ -14268,7 +14268,7 @@ static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
|
|||
if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
|
||||
return SDValue();
|
||||
SDValue Trunc = St->getValue();
|
||||
if (Trunc->getOpcode() != ISD::TRUNCATE)
|
||||
if (Trunc->getOpcode() != ISD::TRUNCATE && Trunc->getOpcode() != ISD::FP_ROUND)
|
||||
return SDValue();
|
||||
EVT FromVT = Trunc->getOperand(0).getValueType();
|
||||
EVT ToVT = Trunc.getValueType();
|
||||
|
@ -14283,7 +14283,10 @@ static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
|
|||
NumElements = 4;
|
||||
if (FromEltVT == MVT::i16 && ToEltVT == MVT::i8)
|
||||
NumElements = 8;
|
||||
if (NumElements == 0 || FromVT.getVectorNumElements() == NumElements ||
|
||||
if (FromEltVT == MVT::f32 && ToEltVT == MVT::f16)
|
||||
NumElements = 4;
|
||||
if (NumElements == 0 ||
|
||||
(FromEltVT != MVT::f32 && FromVT.getVectorNumElements() == NumElements) ||
|
||||
FromVT.getVectorNumElements() % NumElements != 0)
|
||||
return SDValue();
|
||||
|
||||
|
@ -14293,7 +14296,7 @@ static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
|
|||
// rev: N 0 N+1 1 N+2 2 ...
|
||||
auto isVMOVNOriginalMask = [&](ArrayRef<int> M, bool rev) {
|
||||
unsigned NumElts = ToVT.getVectorNumElements();
|
||||
if (NumElts != M.size() || (ToVT != MVT::v8i16 && ToVT != MVT::v16i8))
|
||||
if (NumElts != M.size())
|
||||
return false;
|
||||
|
||||
unsigned Off0 = rev ? NumElts : 0;
|
||||
|
@ -14314,6 +14317,7 @@ static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
|
|||
isVMOVNOriginalMask(Shuffle->getMask(), true))
|
||||
return SDValue();
|
||||
|
||||
LLVMContext &C = *DAG.getContext();
|
||||
SDLoc DL(St);
|
||||
// Details about the old store
|
||||
SDValue Ch = St->getChain();
|
||||
|
@ -14322,8 +14326,11 @@ static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
|
|||
MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
|
||||
AAMDNodes AAInfo = St->getAAInfo();
|
||||
|
||||
EVT NewFromVT = EVT::getVectorVT(*DAG.getContext(), FromEltVT, NumElements);
|
||||
EVT NewToVT = EVT::getVectorVT(*DAG.getContext(), ToEltVT, NumElements);
|
||||
// We split the store into slices of NumElements. fp16 trunc stores are vcvt
|
||||
// and then stored as truncating integer stores.
|
||||
EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements);
|
||||
EVT NewToVT = EVT::getVectorVT(
|
||||
C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements);
|
||||
|
||||
SmallVector<SDValue, 4> Stores;
|
||||
for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
|
||||
|
@ -14333,6 +14340,14 @@ static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
|
|||
SDValue Extract =
|
||||
DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
|
||||
DAG.getConstant(i * NumElements, DL, MVT::i32));
|
||||
|
||||
if (ToEltVT == MVT::f16) {
|
||||
SDValue FPTrunc =
|
||||
DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16),
|
||||
Extract, DAG.getConstant(0, DL, MVT::i32));
|
||||
Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc);
|
||||
}
|
||||
|
||||
SDValue Store = DAG.getTruncStore(
|
||||
Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
|
||||
NewToVT, Alignment.value(), MMOFlags, AAInfo);
|
||||
|
|
|
@ -14,23 +14,8 @@ define void @to_4(float* nocapture readonly %x, half* noalias nocapture %y) {
|
|||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
|
||||
; CHECK-NEXT: vmul.f32 q1, q1, q0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s8, s4
|
||||
; CHECK-NEXT: vmov r2, s8
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s8, s5
|
||||
; CHECK-NEXT: vmov r3, s8
|
||||
; CHECK-NEXT: vmov.16 q2[0], r2
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s12, s6
|
||||
; CHECK-NEXT: vmov.16 q2[1], r3
|
||||
; CHECK-NEXT: vmov r2, s12
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s4, s7
|
||||
; CHECK-NEXT: vmov.16 q2[2], r2
|
||||
; CHECK-NEXT: vmov r2, s4
|
||||
; CHECK-NEXT: vmov.16 q2[3], r2
|
||||
; CHECK-NEXT: vmov r2, s8
|
||||
; CHECK-NEXT: vmov r3, s9
|
||||
; CHECK-NEXT: str r2, [r1]
|
||||
; CHECK-NEXT: str r3, [r1, #4]
|
||||
; CHECK-NEXT: adds r1, #8
|
||||
; CHECK-NEXT: vcvtb.f16.f32 q1, q1
|
||||
; CHECK-NEXT: vstrh.32 q1, [r1], #8
|
||||
; CHECK-NEXT: le lr, .LBB0_1
|
||||
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
|
||||
; CHECK-NEXT: pop {r7, pc}
|
||||
|
@ -73,35 +58,14 @@ define void @to_8(float* nocapture readonly %x, half* noalias nocapture %y) {
|
|||
; CHECK-NEXT: dls lr, lr
|
||||
; CHECK-NEXT: .LBB1_1: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
|
||||
; CHECK-NEXT: vmul.f32 q1, q1, q0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 q1, q1
|
||||
; CHECK-NEXT: vstrh.32 q1, [r1, #8]
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r0], #32
|
||||
; CHECK-NEXT: vmul.f32 q2, q1, q0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s4, s8
|
||||
; CHECK-NEXT: vmov r2, s4
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s4, s9
|
||||
; CHECK-NEXT: vmov r3, s4
|
||||
; CHECK-NEXT: vmov.16 q1[0], r2
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s12, s10
|
||||
; CHECK-NEXT: vmov.16 q1[1], r3
|
||||
; CHECK-NEXT: vmov r2, s12
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s8, s11
|
||||
; CHECK-NEXT: vmov.16 q1[2], r2
|
||||
; CHECK-NEXT: vmov r2, s8
|
||||
; CHECK-NEXT: vldrw.u32 q2, [r0, #-16]
|
||||
; CHECK-NEXT: vmov.16 q1[3], r2
|
||||
; CHECK-NEXT: vmul.f32 q2, q2, q0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s12, s8
|
||||
; CHECK-NEXT: vmov r2, s12
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s12, s9
|
||||
; CHECK-NEXT: vmov.16 q1[4], r2
|
||||
; CHECK-NEXT: vmov r2, s12
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s12, s10
|
||||
; CHECK-NEXT: vmov.16 q1[5], r2
|
||||
; CHECK-NEXT: vmov r2, s12
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s8, s11
|
||||
; CHECK-NEXT: vmov.16 q1[6], r2
|
||||
; CHECK-NEXT: vmov r2, s8
|
||||
; CHECK-NEXT: vmov.16 q1[7], r2
|
||||
; CHECK-NEXT: vstrb.8 q1, [r1], #16
|
||||
; CHECK-NEXT: vmul.f32 q1, q1, q0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 q1, q1
|
||||
; CHECK-NEXT: vstrh.32 q1, [r1], #16
|
||||
; CHECK-NEXT: le lr, .LBB1_1
|
||||
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
|
||||
; CHECK-NEXT: pop {r7, pc}
|
||||
|
@ -144,64 +108,22 @@ define void @to_16(float* nocapture readonly %x, half* noalias nocapture %y) {
|
|||
; CHECK-NEXT: dls lr, lr
|
||||
; CHECK-NEXT: .LBB2_1: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
|
||||
; CHECK-NEXT: vmul.f32 q1, q1, q0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 q1, q1
|
||||
; CHECK-NEXT: vstrh.32 q1, [r1, #24]
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
|
||||
; CHECK-NEXT: vmul.f32 q2, q1, q0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s4, s8
|
||||
; CHECK-NEXT: vmov r2, s4
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s4, s9
|
||||
; CHECK-NEXT: vmov r3, s4
|
||||
; CHECK-NEXT: vmov.16 q1[0], r2
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s12, s10
|
||||
; CHECK-NEXT: vmov.16 q1[1], r3
|
||||
; CHECK-NEXT: vmov r2, s12
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s8, s11
|
||||
; CHECK-NEXT: vmov.16 q1[2], r2
|
||||
; CHECK-NEXT: vmov r2, s8
|
||||
; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
|
||||
; CHECK-NEXT: vmov.16 q1[3], r2
|
||||
; CHECK-NEXT: vmul.f32 q2, q2, q0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s12, s8
|
||||
; CHECK-NEXT: vmov r2, s12
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s12, s9
|
||||
; CHECK-NEXT: vmov.16 q1[4], r2
|
||||
; CHECK-NEXT: vmov r2, s12
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s12, s10
|
||||
; CHECK-NEXT: vmov.16 q1[5], r2
|
||||
; CHECK-NEXT: vmov r2, s12
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s8, s11
|
||||
; CHECK-NEXT: vmov.16 q1[6], r2
|
||||
; CHECK-NEXT: vmov r2, s8
|
||||
; CHECK-NEXT: vmov.16 q1[7], r2
|
||||
; CHECK-NEXT: vstrh.16 q1, [r1, #16]
|
||||
; CHECK-NEXT: vmul.f32 q1, q1, q0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 q1, q1
|
||||
; CHECK-NEXT: vstrh.32 q1, [r1, #16]
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
|
||||
; CHECK-NEXT: vmul.f32 q1, q1, q0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 q1, q1
|
||||
; CHECK-NEXT: vstrh.32 q1, [r1, #8]
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r0], #64
|
||||
; CHECK-NEXT: vmul.f32 q2, q1, q0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s4, s9
|
||||
; CHECK-NEXT: vmov r2, s4
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s4, s8
|
||||
; CHECK-NEXT: vmov r3, s4
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s12, s10
|
||||
; CHECK-NEXT: vmov.16 q1[0], r3
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s8, s11
|
||||
; CHECK-NEXT: vmov.16 q1[1], r2
|
||||
; CHECK-NEXT: vmov r2, s12
|
||||
; CHECK-NEXT: vmov.16 q1[2], r2
|
||||
; CHECK-NEXT: vmov r2, s8
|
||||
; CHECK-NEXT: vldrw.u32 q2, [r0, #-48]
|
||||
; CHECK-NEXT: vmov.16 q1[3], r2
|
||||
; CHECK-NEXT: vmul.f32 q2, q2, q0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s12, s8
|
||||
; CHECK-NEXT: vmov r2, s12
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s12, s9
|
||||
; CHECK-NEXT: vmov.16 q1[4], r2
|
||||
; CHECK-NEXT: vmov r2, s12
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s12, s10
|
||||
; CHECK-NEXT: vmov.16 q1[5], r2
|
||||
; CHECK-NEXT: vmov r2, s12
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s8, s11
|
||||
; CHECK-NEXT: vmov.16 q1[6], r2
|
||||
; CHECK-NEXT: vmov r2, s8
|
||||
; CHECK-NEXT: vmov.16 q1[7], r2
|
||||
; CHECK-NEXT: vstrh.16 q1, [r1], #32
|
||||
; CHECK-NEXT: vmul.f32 q1, q1, q0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 q1, q1
|
||||
; CHECK-NEXT: vstrh.32 q1, [r1], #32
|
||||
; CHECK-NEXT: le lr, .LBB2_1
|
||||
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
|
||||
; CHECK-NEXT: pop {r7, pc}
|
||||
|
@ -452,23 +374,8 @@ define void @both_4(half* nocapture readonly %x, half* noalias nocapture %y) {
|
|||
; CHECK-NEXT: vcvtb.f32.f16 s13, s8
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s12, s4
|
||||
; CHECK-NEXT: vmul.f32 q1, q3, q0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s8, s4
|
||||
; CHECK-NEXT: vmov r2, s8
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s8, s5
|
||||
; CHECK-NEXT: vmov r3, s8
|
||||
; CHECK-NEXT: vmov.16 q2[0], r2
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s12, s6
|
||||
; CHECK-NEXT: vmov.16 q2[1], r3
|
||||
; CHECK-NEXT: vmov r2, s12
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s4, s7
|
||||
; CHECK-NEXT: vmov.16 q2[2], r2
|
||||
; CHECK-NEXT: vmov r2, s4
|
||||
; CHECK-NEXT: vmov.16 q2[3], r2
|
||||
; CHECK-NEXT: vmov r2, s8
|
||||
; CHECK-NEXT: vmov r3, s9
|
||||
; CHECK-NEXT: str r2, [r1]
|
||||
; CHECK-NEXT: str r3, [r1, #4]
|
||||
; CHECK-NEXT: adds r1, #8
|
||||
; CHECK-NEXT: vcvtb.f16.f32 q1, q1
|
||||
; CHECK-NEXT: vstrh.32 q1, [r1], #8
|
||||
; CHECK-NEXT: le lr, .LBB6_1
|
||||
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
|
||||
; CHECK-NEXT: pop {r7, pc}
|
||||
|
@ -506,57 +413,33 @@ define void @both_8(half* nocapture readonly %x, half* noalias nocapture %y) {
|
|||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r7, lr}
|
||||
; CHECK-NEXT: push {r7, lr}
|
||||
; CHECK-NEXT: .vsave {d8, d9}
|
||||
; CHECK-NEXT: vpush {d8, d9}
|
||||
; CHECK-NEXT: adr r2, .LCPI7_0
|
||||
; CHECK-NEXT: mov.w lr, #128
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r2]
|
||||
; CHECK-NEXT: dls lr, lr
|
||||
; CHECK-NEXT: .LBB7_1: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrh.u16 q2, [r0], #16
|
||||
; CHECK-NEXT: vmovx.f16 s6, s9
|
||||
; CHECK-NEXT: vmovx.f16 s4, s8
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s15, s6
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s14, s9
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s13, s4
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s12, s8
|
||||
; CHECK-NEXT: vmul.f32 q3, q3, q0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s4, s12
|
||||
; CHECK-NEXT: vmov r2, s4
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s4, s13
|
||||
; CHECK-NEXT: vmov r3, s4
|
||||
; CHECK-NEXT: vmov.16 q1[0], r2
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s16, s14
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s12, s15
|
||||
; CHECK-NEXT: vmovx.f16 s14, s11
|
||||
; CHECK-NEXT: vmov r2, s16
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s19, s14
|
||||
; CHECK-NEXT: vmov.16 q1[1], r3
|
||||
; CHECK-NEXT: vmov.16 q1[2], r2
|
||||
; CHECK-NEXT: vmov r2, s12
|
||||
; CHECK-NEXT: vmovx.f16 s12, s10
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s18, s11
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s17, s12
|
||||
; CHECK-NEXT: vmov.16 q1[3], r2
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s16, s10
|
||||
; CHECK-NEXT: vmul.f32 q2, q4, q0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s12, s8
|
||||
; CHECK-NEXT: vmov r2, s12
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s12, s9
|
||||
; CHECK-NEXT: vmov.16 q1[4], r2
|
||||
; CHECK-NEXT: vmov r2, s12
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s12, s10
|
||||
; CHECK-NEXT: vmov.16 q1[5], r2
|
||||
; CHECK-NEXT: vmov r2, s12
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s8, s11
|
||||
; CHECK-NEXT: vmov.16 q1[6], r2
|
||||
; CHECK-NEXT: vmov r2, s8
|
||||
; CHECK-NEXT: vmov.16 q1[7], r2
|
||||
; CHECK-NEXT: vstrb.8 q1, [r1], #16
|
||||
; CHECK-NEXT: vldrh.u16 q1, [r0], #16
|
||||
; CHECK-NEXT: vmovx.f16 s10, s7
|
||||
; CHECK-NEXT: vmovx.f16 s8, s6
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s15, s10
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s14, s7
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s13, s8
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s12, s6
|
||||
; CHECK-NEXT: vmul.f32 q2, q3, q0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 q2, q2
|
||||
; CHECK-NEXT: vstrh.32 q2, [r1, #8]
|
||||
; CHECK-NEXT: vmovx.f16 s10, s5
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s15, s10
|
||||
; CHECK-NEXT: vmovx.f16 s8, s4
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s14, s5
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s13, s8
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s12, s4
|
||||
; CHECK-NEXT: vmul.f32 q1, q3, q0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 q1, q1
|
||||
; CHECK-NEXT: vstrh.32 q1, [r1], #16
|
||||
; CHECK-NEXT: le lr, .LBB7_1
|
||||
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
|
||||
; CHECK-NEXT: vpop {d8, d9}
|
||||
; CHECK-NEXT: pop {r7, pc}
|
||||
; CHECK-NEXT: .p2align 4
|
||||
; CHECK-NEXT: @ %bb.3:
|
||||
|
@ -592,97 +475,52 @@ define void @both_16(half* nocapture readonly %x, half* noalias nocapture %y) {
|
|||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r7, lr}
|
||||
; CHECK-NEXT: push {r7, lr}
|
||||
; CHECK-NEXT: .vsave {d8, d9}
|
||||
; CHECK-NEXT: vpush {d8, d9}
|
||||
; CHECK-NEXT: adr r2, .LCPI8_0
|
||||
; CHECK-NEXT: mov.w lr, #64
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r2]
|
||||
; CHECK-NEXT: dls lr, lr
|
||||
; CHECK-NEXT: .LBB8_1: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrh.u16 q2, [r0, #16]
|
||||
; CHECK-NEXT: vmovx.f16 s6, s9
|
||||
; CHECK-NEXT: vmovx.f16 s4, s8
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s15, s6
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s14, s9
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s13, s4
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s12, s8
|
||||
; CHECK-NEXT: vmul.f32 q3, q3, q0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s4, s12
|
||||
; CHECK-NEXT: vmov r2, s4
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s4, s13
|
||||
; CHECK-NEXT: vmov r3, s4
|
||||
; CHECK-NEXT: vmov.16 q1[0], r2
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s16, s14
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s12, s15
|
||||
; CHECK-NEXT: vmovx.f16 s14, s11
|
||||
; CHECK-NEXT: vmov r2, s16
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s19, s14
|
||||
; CHECK-NEXT: vmov.16 q1[1], r3
|
||||
; CHECK-NEXT: vmov.16 q1[2], r2
|
||||
; CHECK-NEXT: vmov r2, s12
|
||||
; CHECK-NEXT: vmovx.f16 s12, s10
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s18, s11
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s17, s12
|
||||
; CHECK-NEXT: vmov.16 q1[3], r2
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s16, s10
|
||||
; CHECK-NEXT: vmul.f32 q2, q4, q0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s12, s8
|
||||
; CHECK-NEXT: vmov r2, s12
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s12, s9
|
||||
; CHECK-NEXT: vmov.16 q1[4], r2
|
||||
; CHECK-NEXT: vmov r2, s12
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s12, s10
|
||||
; CHECK-NEXT: vmov.16 q1[5], r2
|
||||
; CHECK-NEXT: vmov r2, s12
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s8, s11
|
||||
; CHECK-NEXT: vmov.16 q1[6], r2
|
||||
; CHECK-NEXT: vmov r2, s8
|
||||
; CHECK-NEXT: vldrh.u16 q2, [r0], #32
|
||||
; CHECK-NEXT: vmov.16 q1[7], r2
|
||||
; CHECK-NEXT: vstrh.16 q1, [r1, #16]
|
||||
; CHECK-NEXT: vmovx.f16 s6, s9
|
||||
; CHECK-NEXT: vmovx.f16 s4, s8
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s15, s6
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s14, s9
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s13, s4
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s12, s8
|
||||
; CHECK-NEXT: vmul.f32 q3, q3, q0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s4, s12
|
||||
; CHECK-NEXT: vmov r2, s4
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s4, s13
|
||||
; CHECK-NEXT: vmov r3, s4
|
||||
; CHECK-NEXT: vmov.16 q1[0], r2
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s16, s14
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s12, s15
|
||||
; CHECK-NEXT: vmovx.f16 s14, s11
|
||||
; CHECK-NEXT: vmov r2, s16
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s19, s14
|
||||
; CHECK-NEXT: vmov.16 q1[1], r3
|
||||
; CHECK-NEXT: vmov.16 q1[2], r2
|
||||
; CHECK-NEXT: vmov r2, s12
|
||||
; CHECK-NEXT: vmovx.f16 s12, s10
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s18, s11
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s17, s12
|
||||
; CHECK-NEXT: vmov.16 q1[3], r2
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s16, s10
|
||||
; CHECK-NEXT: vmul.f32 q2, q4, q0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s12, s8
|
||||
; CHECK-NEXT: vmov r2, s12
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s12, s9
|
||||
; CHECK-NEXT: vmov.16 q1[4], r2
|
||||
; CHECK-NEXT: vmov r2, s12
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s12, s10
|
||||
; CHECK-NEXT: vmov.16 q1[5], r2
|
||||
; CHECK-NEXT: vmov r2, s12
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s8, s11
|
||||
; CHECK-NEXT: vmov.16 q1[6], r2
|
||||
; CHECK-NEXT: vmov r2, s8
|
||||
; CHECK-NEXT: vmov.16 q1[7], r2
|
||||
; CHECK-NEXT: vstrh.16 q1, [r1], #32
|
||||
; CHECK-NEXT: vldrh.u16 q1, [r0, #16]
|
||||
; CHECK-NEXT: vmovx.f16 s10, s7
|
||||
; CHECK-NEXT: vmovx.f16 s8, s6
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s15, s10
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s14, s7
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s13, s8
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s12, s6
|
||||
; CHECK-NEXT: vmul.f32 q2, q3, q0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 q2, q2
|
||||
; CHECK-NEXT: vstrh.32 q2, [r1, #24]
|
||||
; CHECK-NEXT: vmovx.f16 s10, s5
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s15, s10
|
||||
; CHECK-NEXT: vmovx.f16 s8, s4
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s14, s5
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s13, s8
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s12, s4
|
||||
; CHECK-NEXT: vmul.f32 q1, q3, q0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 q1, q1
|
||||
; CHECK-NEXT: vstrh.32 q1, [r1, #16]
|
||||
; CHECK-NEXT: vldrh.u16 q1, [r0], #32
|
||||
; CHECK-NEXT: vmovx.f16 s10, s7
|
||||
; CHECK-NEXT: vmovx.f16 s8, s6
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s15, s10
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s14, s7
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s13, s8
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s12, s6
|
||||
; CHECK-NEXT: vmul.f32 q2, q3, q0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 q2, q2
|
||||
; CHECK-NEXT: vstrh.32 q2, [r1, #8]
|
||||
; CHECK-NEXT: vmovx.f16 s10, s5
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s15, s10
|
||||
; CHECK-NEXT: vmovx.f16 s8, s4
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s14, s5
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s13, s8
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s12, s4
|
||||
; CHECK-NEXT: vmul.f32 q1, q3, q0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 q1, q1
|
||||
; CHECK-NEXT: vstrh.32 q1, [r1], #32
|
||||
; CHECK-NEXT: le lr, .LBB8_1
|
||||
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
|
||||
; CHECK-NEXT: vpop {d8, d9}
|
||||
; CHECK-NEXT: pop {r7, pc}
|
||||
; CHECK-NEXT: .p2align 4
|
||||
; CHECK-NEXT: @ %bb.3:
|
||||
|
|
|
@ -345,21 +345,8 @@ entry:
|
|||
define arm_aapcs_vfpcc void @store_trunc_4(<4 x half>* %src, <4 x float> %val) {
|
||||
; CHECK-LABEL: store_trunc_4:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s4, s0
|
||||
; CHECK-NEXT: vmov r1, s4
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s4, s1
|
||||
; CHECK-NEXT: vmov r2, s4
|
||||
; CHECK-NEXT: vmov.16 q1[0], r1
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s8, s2
|
||||
; CHECK-NEXT: vmov.16 q1[1], r2
|
||||
; CHECK-NEXT: vmov r1, s8
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s0, s3
|
||||
; CHECK-NEXT: vmov.16 q1[2], r1
|
||||
; CHECK-NEXT: vmov r1, s0
|
||||
; CHECK-NEXT: vmov.16 q1[3], r1
|
||||
; CHECK-NEXT: vmov r2, s5
|
||||
; CHECK-NEXT: vmov r1, s4
|
||||
; CHECK-NEXT: strd r1, r2, [r0]
|
||||
; CHECK-NEXT: vcvtb.f16.f32 q0, q0
|
||||
; CHECK-NEXT: vstrh.32 q0, [r0]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%e = fptrunc <4 x float> %val to <4 x half>
|
||||
|
@ -370,31 +357,10 @@ entry:
|
|||
define arm_aapcs_vfpcc void @store_trunc_8(<8 x half>* %src, <8 x float> %val) {
|
||||
; CHECK-LABEL: store_trunc_8:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s8, s0
|
||||
; CHECK-NEXT: vmov r1, s8
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s8, s1
|
||||
; CHECK-NEXT: vmov r2, s8
|
||||
; CHECK-NEXT: vmov.16 q2[0], r1
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s12, s2
|
||||
; CHECK-NEXT: vmov.16 q2[1], r2
|
||||
; CHECK-NEXT: vmov r1, s12
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s0, s3
|
||||
; CHECK-NEXT: vmov.16 q2[2], r1
|
||||
; CHECK-NEXT: vmov r1, s0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s0, s4
|
||||
; CHECK-NEXT: vmov.16 q2[3], r1
|
||||
; CHECK-NEXT: vmov r1, s0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s0, s5
|
||||
; CHECK-NEXT: vmov.16 q2[4], r1
|
||||
; CHECK-NEXT: vmov r1, s0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s0, s6
|
||||
; CHECK-NEXT: vmov.16 q2[5], r1
|
||||
; CHECK-NEXT: vmov r1, s0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s0, s7
|
||||
; CHECK-NEXT: vmov.16 q2[6], r1
|
||||
; CHECK-NEXT: vmov r1, s0
|
||||
; CHECK-NEXT: vmov.16 q2[7], r1
|
||||
; CHECK-NEXT: vstrw.32 q2, [r0]
|
||||
; CHECK-NEXT: vcvtb.f16.f32 q1, q1
|
||||
; CHECK-NEXT: vcvtb.f16.f32 q0, q0
|
||||
; CHECK-NEXT: vstrh.32 q1, [r0, #8]
|
||||
; CHECK-NEXT: vstrh.32 q0, [r0]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%e = fptrunc <8 x float> %val to <8 x half>
|
||||
|
@ -405,59 +371,14 @@ entry:
|
|||
define arm_aapcs_vfpcc void @store_trunc_16(<16 x half>* %src, <16 x float> %val) {
|
||||
; CHECK-LABEL: store_trunc_16:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .vsave {d8, d9, d10}
|
||||
; CHECK-NEXT: vpush {d8, d9, d10}
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s16, s8
|
||||
; CHECK-NEXT: vmov r1, s16
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s16, s9
|
||||
; CHECK-NEXT: vmov r2, s16
|
||||
; CHECK-NEXT: vmov.16 q4[0], r1
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s20, s10
|
||||
; CHECK-NEXT: vmov.16 q4[1], r2
|
||||
; CHECK-NEXT: vmov r1, s20
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s8, s11
|
||||
; CHECK-NEXT: vmov.16 q4[2], r1
|
||||
; CHECK-NEXT: vmov r1, s8
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s8, s12
|
||||
; CHECK-NEXT: vmov.16 q4[3], r1
|
||||
; CHECK-NEXT: vmov r1, s8
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s8, s13
|
||||
; CHECK-NEXT: vmov.16 q4[4], r1
|
||||
; CHECK-NEXT: vmov r1, s8
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s8, s14
|
||||
; CHECK-NEXT: vmov.16 q4[5], r1
|
||||
; CHECK-NEXT: vmov r1, s8
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s8, s15
|
||||
; CHECK-NEXT: vmov.16 q4[6], r1
|
||||
; CHECK-NEXT: vmov r1, s8
|
||||
; CHECK-NEXT: vmov.16 q4[7], r1
|
||||
; CHECK-NEXT: vstrw.32 q4, [r0, #16]
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s8, s0
|
||||
; CHECK-NEXT: vmov r1, s8
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s8, s1
|
||||
; CHECK-NEXT: vmov r2, s8
|
||||
; CHECK-NEXT: vmov.16 q2[0], r1
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s12, s2
|
||||
; CHECK-NEXT: vmov.16 q2[1], r2
|
||||
; CHECK-NEXT: vmov r1, s12
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s0, s3
|
||||
; CHECK-NEXT: vmov.16 q2[2], r1
|
||||
; CHECK-NEXT: vmov r1, s0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s0, s4
|
||||
; CHECK-NEXT: vmov.16 q2[3], r1
|
||||
; CHECK-NEXT: vmov r1, s0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s0, s5
|
||||
; CHECK-NEXT: vmov.16 q2[4], r1
|
||||
; CHECK-NEXT: vmov r1, s0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s0, s6
|
||||
; CHECK-NEXT: vmov.16 q2[5], r1
|
||||
; CHECK-NEXT: vmov r1, s0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 s0, s7
|
||||
; CHECK-NEXT: vmov.16 q2[6], r1
|
||||
; CHECK-NEXT: vmov r1, s0
|
||||
; CHECK-NEXT: vmov.16 q2[7], r1
|
||||
; CHECK-NEXT: vstrw.32 q2, [r0]
|
||||
; CHECK-NEXT: vpop {d8, d9, d10}
|
||||
; CHECK-NEXT: vcvtb.f16.f32 q3, q3
|
||||
; CHECK-NEXT: vcvtb.f16.f32 q2, q2
|
||||
; CHECK-NEXT: vcvtb.f16.f32 q1, q1
|
||||
; CHECK-NEXT: vcvtb.f16.f32 q0, q0
|
||||
; CHECK-NEXT: vstrh.32 q3, [r0, #24]
|
||||
; CHECK-NEXT: vstrh.32 q2, [r0, #16]
|
||||
; CHECK-NEXT: vstrh.32 q1, [r0, #8]
|
||||
; CHECK-NEXT: vstrh.32 q0, [r0]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%e = fptrunc <16 x float> %val to <16 x half>
|
||||
|
|
Loading…
Reference in New Issue