[ARM] Split FPExt loads
This extends PerformSplittingToWideningLoad to handle FP_EXTEND as well as sign and zero extends. For fp extends it uses an integer extending load followed by a VCVTL on the bottom lanes to efficiently perform an fpext from a smaller-than-legal type. The existing code needed a small rewrite: rather than splitting the node in two and letting legalization take it from there, it now splits the load directly into legal chunks.

Differential Revision: https://reviews.llvm.org/D81340
commit d79b57b8bb (parent 06be4bb5e6)
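As a rough illustration of the pattern being targeted (a reduced example modeled on the load_ext_* tests updated below; the function name is hypothetical): an fpext of a load whose result is wider than a legal MVE vector, such as

```llvm
; v8f32 is twice the 128-bit MVE vector width, so the fpext of this load is
; now lowered as two v4i16 zero-extending loads (vldrh.u32), each followed by
; a bottom-lane vcvtb.f32.f16, instead of per-element converts.
define arm_aapcs_vfpcc <8 x float> @fpext_of_load(<8 x half>* %src) {
entry:
  %l = load <8 x half>, <8 x half>* %src, align 4
  %e = fpext <8 x half> %l to <8 x float>
  ret <8 x float> %e
}
```

Run through llc with an MVE float subtarget (the pre-existing MVE tests use -mattr=+mve.fp), this should produce the vldrh.u32/vcvtb.f32.f16 pairs seen in the updated CHECK lines.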
@@ -967,6 +967,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
     setTargetDAGCombine(ISD::UMIN);
     setTargetDAGCombine(ISD::SMAX);
     setTargetDAGCombine(ISD::UMAX);
+    setTargetDAGCombine(ISD::FP_EXTEND);
   }
 
   if (!Subtarget->hasFP64()) {
@@ -15062,9 +15063,10 @@ static SDValue PerformShiftCombine(SDNode *N,
   return SDValue();
 }
 
-// Look for a sign/zero extend of a larger than legal load. This can be split
-// into two extending loads, which are simpler to deal with than an arbitrary
-// sign extend.
+// Look for a sign/zero/fpextend extend of a larger than legal load. This can be
+// split into multiple extending loads, which are simpler to deal with than an
+// arbitrary extend. For fp extends we use an integer extending load and a VCVTL
+// to convert the type to an f32.
 static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
   SDValue N0 = N->getOperand(0);
   if (N0.getOpcode() != ISD::LOAD)
@@ -15086,12 +15088,15 @@ static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
     NumElements = 4;
   if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
     NumElements = 8;
+  if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16)
+    NumElements = 4;
   if (NumElements == 0 ||
-      FromVT.getVectorNumElements() == NumElements ||
+      (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) ||
       FromVT.getVectorNumElements() % NumElements != 0 ||
       !isPowerOf2_32(NumElements))
     return SDValue();
 
+  LLVMContext &C = *DAG.getContext();
   SDLoc DL(LD);
   // Details about the old load
   SDValue Ch = LD->getChain();
@@ -15103,28 +15108,43 @@ static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
   ISD::LoadExtType NewExtType =
       N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
   SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
-  EVT NewFromVT = FromVT.getHalfNumVectorElementsVT(*DAG.getContext());
-  EVT NewToVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
-  unsigned NewOffset = NewFromVT.getSizeInBits() / 8;
-  SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset);
+  EVT NewFromVT = EVT::getVectorVT(
+      C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
+  EVT NewToVT = EVT::getVectorVT(
+      C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
 
-  // Split the load in half, each side of which is extended separately. This
-  // is good enough, as legalisation will take it from there. They are either
-  // already legal or they will be split further into something that is
-  // legal.
-  SDValue NewLoad1 = DAG.getLoad(
-      ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, BasePtr, Offset,
-      LD->getPointerInfo(), NewFromVT, Alignment.value(), MMOFlags, AAInfo);
-  SDValue NewLoad2 =
-      DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
-                  LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
-                  Alignment.value(), MMOFlags, AAInfo);
+  SmallVector<SDValue, 4> Loads;
+  SmallVector<SDValue, 4> Chains;
+  for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
+    unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
+    SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset);
 
-  SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
-                                 SDValue(NewLoad1.getNode(), 1),
-                                 SDValue(NewLoad2.getNode(), 1));
+    SDValue NewLoad =
+        DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
+                    LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
+                    Alignment.value(), MMOFlags, AAInfo);
+    Loads.push_back(NewLoad);
+    Chains.push_back(SDValue(NewLoad.getNode(), 1));
+  }
+
+  // Float truncs need to be extended with VCVTB's into their floating point types.
+  if (FromEltVT == MVT::f16) {
+    SmallVector<SDValue, 4> Extends;
+
+    for (unsigned i = 0; i < Loads.size(); i++) {
+      SDValue LoadBC =
+          DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]);
+      SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC,
+                                  DAG.getConstant(0, DL, MVT::i32));
+      Extends.push_back(FPExt);
+    }
+
+    Loads = Extends;
+  }
+
+  SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
   DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
-  return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, NewLoad1, NewLoad2);
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Loads);
 }
 
 /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
@@ -15172,6 +15192,15 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG,
+                                      const ARMSubtarget *ST) {
+  if (ST->hasMVEFloatOps())
+    if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
+      return NewLoad;
+
+  return SDValue();
+}
+
 /// PerformMinMaxCombine - Target-specific DAG combining for creating truncating
 /// saturates.
 static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG,
@@ -15830,6 +15859,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::ZERO_EXTEND:
   case ISD::ANY_EXTEND:
     return PerformExtendCombine(N, DCI.DAG, Subtarget);
+  case ISD::FP_EXTEND:
+    return PerformFPExtendCombine(N, DCI.DAG, Subtarget);
   case ISD::SMIN:
   case ISD::UMIN:
   case ISD::SMAX:
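To make the chunking above concrete (a hypothetical worked case, following the loop in PerformSplittingToWideningLoad): for a v16f16 load fpext'd to v16f32, NumElements is 4, so the loop emits 16/4 = 4 extending loads of NewFromVT = v4i16 at byte offsets i * 64/8 = 0, 8, 16 and 24, each VECTOR_REG_CAST to v8f16 and converted with a lane-0 VCVTL. That matches the vldrh.u32 offset sequence checked in the load_ext_16 test below.

```llvm
; Hypothetical reduced input for the 16-wide case. Expected selection with
; MVE.fp: four vldrh.u32 loads at offsets 0, 8, 16 and 24, each followed by
; a vcvtb.f32.f16 on the bottom lanes.
define arm_aapcs_vfpcc <16 x float> @fpext_v16(<16 x half>* %src) {
entry:
  %l = load <16 x half>, <16 x half>* %src, align 4
  %e = fpext <16 x half> %l to <16 x float>
  ret <16 x float> %e
}
```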
@@ -166,18 +166,9 @@ define void @from_4(half* nocapture readonly %x, float* noalias nocapture %y) {
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB3_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr r2, [r0]
-; CHECK-NEXT:    ldr r3, [r0, #4]
-; CHECK-NEXT:    adds r0, #8
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    vmov.32 q1[1], r3
-; CHECK-NEXT:    vmovx.f16 s10, s5
-; CHECK-NEXT:    vmovx.f16 s8, s4
-; CHECK-NEXT:    vcvtb.f32.f16 s15, s10
-; CHECK-NEXT:    vcvtb.f32.f16 s14, s5
-; CHECK-NEXT:    vcvtb.f32.f16 s13, s8
-; CHECK-NEXT:    vcvtb.f32.f16 s12, s4
-; CHECK-NEXT:    vmul.f32 q1, q3, q0
+; CHECK-NEXT:    vldrh.u32 q1, [r0], #8
+; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
+; CHECK-NEXT:    vmul.f32 q1, q1, q0
 ; CHECK-NEXT:    vstrb.8 q1, [r1], #16
 ; CHECK-NEXT:    le lr, .LBB3_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
@@ -215,34 +206,22 @@ define void @from_8(half* nocapture readonly %x, float* noalias nocapture %y) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    adr r2, .LCPI4_0
 ; CHECK-NEXT:    mov.w lr, #128
 ; CHECK-NEXT:    vldrw.u32 q0, [r2]
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB4_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrh.u16 q1, [r0], #16
-; CHECK-NEXT:    vmovx.f16 s8, s5
-; CHECK-NEXT:    vmovx.f16 s13, s7
-; CHECK-NEXT:    vcvtb.f32.f16 s11, s8
-; CHECK-NEXT:    vmovx.f16 s14, s6
-; CHECK-NEXT:    vcvtb.f32.f16 s10, s5
-; CHECK-NEXT:    vcvtb.f32.f16 s19, s13
-; CHECK-NEXT:    vcvtb.f32.f16 s18, s7
-; CHECK-NEXT:    vmovx.f16 s12, s4
-; CHECK-NEXT:    vcvtb.f32.f16 s17, s14
-; CHECK-NEXT:    vcvtb.f32.f16 s16, s6
-; CHECK-NEXT:    vcvtb.f32.f16 s9, s12
-; CHECK-NEXT:    vcvtb.f32.f16 s8, s4
-; CHECK-NEXT:    vmul.f32 q1, q4, q0
+; CHECK-NEXT:    vldrh.u32 q1, [r0, #8]
+; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
+; CHECK-NEXT:    vmul.f32 q1, q1, q0
 ; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
-; CHECK-NEXT:    vmul.f32 q1, q2, q0
+; CHECK-NEXT:    vldrh.u32 q1, [r0], #16
+; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
+; CHECK-NEXT:    vmul.f32 q1, q1, q0
 ; CHECK-NEXT:    vstrw.32 q1, [r1], #32
 ; CHECK-NEXT:    le lr, .LBB4_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
-; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.3:
@@ -277,51 +256,30 @@ define void @from_16(half* nocapture readonly %x, float* noalias nocapture %y) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    adr r2, .LCPI5_0
 ; CHECK-NEXT:    mov.w lr, #64
 ; CHECK-NEXT:    vldrw.u32 q0, [r2]
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB5_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrh.u16 q1, [r0], #32
-; CHECK-NEXT:    vmovx.f16 s12, s5
-; CHECK-NEXT:    vldrh.u16 q2, [r0, #-16]
-; CHECK-NEXT:    vcvtb.f32.f16 s15, s12
-; CHECK-NEXT:    vmovx.f16 s18, s4
-; CHECK-NEXT:    vcvtb.f32.f16 s14, s5
-; CHECK-NEXT:    vmovx.f16 s16, s7
-; CHECK-NEXT:    vcvtb.f32.f16 s13, s18
-; CHECK-NEXT:    vmovx.f16 s20, s9
-; CHECK-NEXT:    vcvtb.f32.f16 s12, s4
-; CHECK-NEXT:    vcvtb.f32.f16 s19, s16
-; CHECK-NEXT:    vmovx.f16 s22, s6
-; CHECK-NEXT:    vcvtb.f32.f16 s18, s7
-; CHECK-NEXT:    vcvtb.f32.f16 s17, s22
-; CHECK-NEXT:    vcvtb.f32.f16 s23, s20
-; CHECK-NEXT:    vmovx.f16 s28, s11
-; CHECK-NEXT:    vcvtb.f32.f16 s22, s9
-; CHECK-NEXT:    vcvtb.f32.f16 s31, s28
-; CHECK-NEXT:    vmovx.f16 s26, s10
-; CHECK-NEXT:    vcvtb.f32.f16 s30, s11
-; CHECK-NEXT:    vmovx.f16 s24, s8
-; CHECK-NEXT:    vcvtb.f32.f16 s29, s26
-; CHECK-NEXT:    vcvtb.f32.f16 s28, s10
-; CHECK-NEXT:    vcvtb.f32.f16 s21, s24
-; CHECK-NEXT:    vcvtb.f32.f16 s20, s8
-; CHECK-NEXT:    vcvtb.f32.f16 s16, s6
-; CHECK-NEXT:    vmul.f32 q1, q7, q0
+; CHECK-NEXT:    vldrh.u32 q1, [r0, #24]
+; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
+; CHECK-NEXT:    vmul.f32 q1, q1, q0
 ; CHECK-NEXT:    vstrw.32 q1, [r1, #48]
-; CHECK-NEXT:    vmul.f32 q1, q5, q0
+; CHECK-NEXT:    vldrh.u32 q1, [r0, #16]
+; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
+; CHECK-NEXT:    vmul.f32 q1, q1, q0
 ; CHECK-NEXT:    vstrw.32 q1, [r1, #32]
-; CHECK-NEXT:    vmul.f32 q1, q4, q0
+; CHECK-NEXT:    vldrh.u32 q1, [r0, #8]
+; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
+; CHECK-NEXT:    vmul.f32 q1, q1, q0
 ; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
-; CHECK-NEXT:    vmul.f32 q1, q3, q0
+; CHECK-NEXT:    vldrh.u32 q1, [r0], #32
+; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
+; CHECK-NEXT:    vmul.f32 q1, q1, q0
 ; CHECK-NEXT:    vstrw.32 q1, [r1], #64
 ; CHECK-NEXT:    le lr, .LBB5_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.3:
@@ -362,18 +320,9 @@ define void @both_4(half* nocapture readonly %x, half* noalias nocapture %y) {
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB6_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr r2, [r0]
-; CHECK-NEXT:    ldr r3, [r0, #4]
-; CHECK-NEXT:    adds r0, #8
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    vmov.32 q1[1], r3
-; CHECK-NEXT:    vmovx.f16 s10, s5
-; CHECK-NEXT:    vmovx.f16 s8, s4
-; CHECK-NEXT:    vcvtb.f32.f16 s15, s10
-; CHECK-NEXT:    vcvtb.f32.f16 s14, s5
-; CHECK-NEXT:    vcvtb.f32.f16 s13, s8
-; CHECK-NEXT:    vcvtb.f32.f16 s12, s4
-; CHECK-NEXT:    vmul.f32 q1, q3, q0
+; CHECK-NEXT:    vldrh.u32 q1, [r0], #8
+; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
+; CHECK-NEXT:    vmul.f32 q1, q1, q0
 ; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
 ; CHECK-NEXT:    vstrh.32 q1, [r1], #8
 ; CHECK-NEXT:    le lr, .LBB6_1
@@ -419,23 +368,14 @@ define void @both_8(half* nocapture readonly %x, half* noalias nocapture %y) {
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB7_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrh.u16 q1, [r0], #16
-; CHECK-NEXT:    vmovx.f16 s10, s7
-; CHECK-NEXT:    vmovx.f16 s8, s6
-; CHECK-NEXT:    vcvtb.f32.f16 s15, s10
-; CHECK-NEXT:    vcvtb.f32.f16 s14, s7
-; CHECK-NEXT:    vcvtb.f32.f16 s13, s8
-; CHECK-NEXT:    vcvtb.f32.f16 s12, s6
-; CHECK-NEXT:    vmul.f32 q2, q3, q0
-; CHECK-NEXT:    vcvtb.f16.f32 q2, q2
-; CHECK-NEXT:    vstrh.32 q2, [r1, #8]
-; CHECK-NEXT:    vmovx.f16 s10, s5
-; CHECK-NEXT:    vcvtb.f32.f16 s15, s10
-; CHECK-NEXT:    vmovx.f16 s8, s4
-; CHECK-NEXT:    vcvtb.f32.f16 s14, s5
-; CHECK-NEXT:    vcvtb.f32.f16 s13, s8
-; CHECK-NEXT:    vcvtb.f32.f16 s12, s4
-; CHECK-NEXT:    vmul.f32 q1, q3, q0
+; CHECK-NEXT:    vldrh.u32 q1, [r0, #8]
+; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
+; CHECK-NEXT:    vmul.f32 q1, q1, q0
+; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
+; CHECK-NEXT:    vstrh.32 q1, [r1, #8]
+; CHECK-NEXT:    vldrh.u32 q1, [r0], #16
+; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
+; CHECK-NEXT:    vmul.f32 q1, q1, q0
 ; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
 ; CHECK-NEXT:    vstrh.32 q1, [r1], #16
 ; CHECK-NEXT:    le lr, .LBB7_1
@@ -481,42 +421,24 @@ define void @both_16(half* nocapture readonly %x, half* noalias nocapture %y) {
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB8_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrh.u16 q1, [r0, #16]
-; CHECK-NEXT:    vmovx.f16 s10, s7
-; CHECK-NEXT:    vmovx.f16 s8, s6
-; CHECK-NEXT:    vcvtb.f32.f16 s15, s10
-; CHECK-NEXT:    vcvtb.f32.f16 s14, s7
-; CHECK-NEXT:    vcvtb.f32.f16 s13, s8
-; CHECK-NEXT:    vcvtb.f32.f16 s12, s6
-; CHECK-NEXT:    vmul.f32 q2, q3, q0
-; CHECK-NEXT:    vcvtb.f16.f32 q2, q2
-; CHECK-NEXT:    vstrh.32 q2, [r1, #24]
-; CHECK-NEXT:    vmovx.f16 s10, s5
-; CHECK-NEXT:    vcvtb.f32.f16 s15, s10
-; CHECK-NEXT:    vmovx.f16 s8, s4
-; CHECK-NEXT:    vcvtb.f32.f16 s14, s5
-; CHECK-NEXT:    vcvtb.f32.f16 s13, s8
-; CHECK-NEXT:    vcvtb.f32.f16 s12, s4
-; CHECK-NEXT:    vmul.f32 q1, q3, q0
+; CHECK-NEXT:    vldrh.u32 q1, [r0, #24]
+; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
+; CHECK-NEXT:    vmul.f32 q1, q1, q0
+; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
+; CHECK-NEXT:    vstrh.32 q1, [r1, #24]
+; CHECK-NEXT:    vldrh.u32 q1, [r0, #16]
+; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
+; CHECK-NEXT:    vmul.f32 q1, q1, q0
 ; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
 ; CHECK-NEXT:    vstrh.32 q1, [r1, #16]
-; CHECK-NEXT:    vldrh.u16 q1, [r0], #32
-; CHECK-NEXT:    vmovx.f16 s10, s7
-; CHECK-NEXT:    vmovx.f16 s8, s6
-; CHECK-NEXT:    vcvtb.f32.f16 s15, s10
-; CHECK-NEXT:    vcvtb.f32.f16 s14, s7
-; CHECK-NEXT:    vcvtb.f32.f16 s13, s8
-; CHECK-NEXT:    vcvtb.f32.f16 s12, s6
-; CHECK-NEXT:    vmul.f32 q2, q3, q0
-; CHECK-NEXT:    vcvtb.f16.f32 q2, q2
-; CHECK-NEXT:    vstrh.32 q2, [r1, #8]
-; CHECK-NEXT:    vmovx.f16 s10, s5
-; CHECK-NEXT:    vcvtb.f32.f16 s15, s10
-; CHECK-NEXT:    vmovx.f16 s8, s4
-; CHECK-NEXT:    vcvtb.f32.f16 s14, s5
-; CHECK-NEXT:    vcvtb.f32.f16 s13, s8
-; CHECK-NEXT:    vcvtb.f32.f16 s12, s4
-; CHECK-NEXT:    vmul.f32 q1, q3, q0
+; CHECK-NEXT:    vldrh.u32 q1, [r0, #8]
+; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
+; CHECK-NEXT:    vmul.f32 q1, q1, q0
+; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
+; CHECK-NEXT:    vstrh.32 q1, [r1, #8]
+; CHECK-NEXT:    vldrh.u32 q1, [r0], #32
+; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
+; CHECK-NEXT:    vmul.f32 q1, q1, q0
 ; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
 ; CHECK-NEXT:    vstrh.32 q1, [r1], #32
 ; CHECK-NEXT:    le lr, .LBB8_1
@@ -217,15 +217,8 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @load_ext_4(<4 x half>* %src) {
 ; CHECK-LABEL: load_ext_4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    ldrd r1, r0, [r0]
-; CHECK-NEXT:    vmov.32 q1[0], r1
-; CHECK-NEXT:    vmov.32 q1[1], r0
-; CHECK-NEXT:    vmovx.f16 s0, s5
-; CHECK-NEXT:    vmovx.f16 s8, s4
-; CHECK-NEXT:    vcvtb.f32.f16 s3, s0
-; CHECK-NEXT:    vcvtb.f32.f16 s2, s5
-; CHECK-NEXT:    vcvtb.f32.f16 s1, s8
-; CHECK-NEXT:    vcvtb.f32.f16 s0, s4
+; CHECK-NEXT:    vldrh.u32 q0, [r0]
+; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
 ; CHECK-NEXT:    bx lr
 entry:
   %wide.load = load <4 x half>, <4 x half>* %src, align 4
@@ -236,19 +229,10 @@ entry:
 define arm_aapcs_vfpcc <8 x float> @load_ext_8(<8 x half>* %src) {
 ; CHECK-LABEL: load_ext_8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vmovx.f16 s0, s9
-; CHECK-NEXT:    vmovx.f16 s6, s8
-; CHECK-NEXT:    vcvtb.f32.f16 s3, s0
-; CHECK-NEXT:    vmovx.f16 s4, s11
-; CHECK-NEXT:    vcvtb.f32.f16 s2, s9
-; CHECK-NEXT:    vmovx.f16 s12, s10
-; CHECK-NEXT:    vcvtb.f32.f16 s1, s6
-; CHECK-NEXT:    vcvtb.f32.f16 s0, s8
-; CHECK-NEXT:    vcvtb.f32.f16 s7, s4
-; CHECK-NEXT:    vcvtb.f32.f16 s6, s11
-; CHECK-NEXT:    vcvtb.f32.f16 s5, s12
-; CHECK-NEXT:    vcvtb.f32.f16 s4, s10
+; CHECK-NEXT:    vldrh.u32 q0, [r0]
+; CHECK-NEXT:    vldrh.u32 q1, [r0, #8]
+; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
+; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %wide.load = load <8 x half>, <8 x half>* %src, align 4
@@ -259,35 +243,14 @@ entry:
 define arm_aapcs_vfpcc <16 x float> @load_ext_16(<16 x half>* %src) {
 ; CHECK-LABEL: load_ext_16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10}
-; CHECK-NEXT:    vpush {d8, d9, d10}
-; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #16]
-; CHECK-NEXT:    vmovx.f16 s0, s9
-; CHECK-NEXT:    vmovx.f16 s6, s8
-; CHECK-NEXT:    vcvtb.f32.f16 s3, s0
-; CHECK-NEXT:    vmovx.f16 s4, s11
-; CHECK-NEXT:    vcvtb.f32.f16 s2, s9
-; CHECK-NEXT:    vmovx.f16 s15, s10
-; CHECK-NEXT:    vcvtb.f32.f16 s1, s6
-; CHECK-NEXT:    vmovx.f16 s13, s17
-; CHECK-NEXT:    vcvtb.f32.f16 s0, s8
-; CHECK-NEXT:    vcvtb.f32.f16 s7, s4
-; CHECK-NEXT:    vcvtb.f32.f16 s6, s11
-; CHECK-NEXT:    vmovx.f16 s14, s16
-; CHECK-NEXT:    vcvtb.f32.f16 s5, s15
-; CHECK-NEXT:    vmovx.f16 s12, s19
-; CHECK-NEXT:    vcvtb.f32.f16 s4, s10
-; CHECK-NEXT:    vcvtb.f32.f16 s11, s13
-; CHECK-NEXT:    vcvtb.f32.f16 s10, s17
-; CHECK-NEXT:    vmovx.f16 s20, s18
-; CHECK-NEXT:    vcvtb.f32.f16 s9, s14
-; CHECK-NEXT:    vcvtb.f32.f16 s8, s16
-; CHECK-NEXT:    vcvtb.f32.f16 s15, s12
-; CHECK-NEXT:    vcvtb.f32.f16 s14, s19
-; CHECK-NEXT:    vcvtb.f32.f16 s13, s20
-; CHECK-NEXT:    vcvtb.f32.f16 s12, s18
-; CHECK-NEXT:    vpop {d8, d9, d10}
+; CHECK-NEXT:    vldrh.u32 q0, [r0]
+; CHECK-NEXT:    vldrh.u32 q1, [r0, #8]
+; CHECK-NEXT:    vldrh.u32 q2, [r0, #16]
+; CHECK-NEXT:    vldrh.u32 q3, [r0, #24]
+; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
+; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
+; CHECK-NEXT:    vcvtb.f32.f16 q2, q2
+; CHECK-NEXT:    vcvtb.f32.f16 q3, q3
 ; CHECK-NEXT:    bx lr
 entry:
   %wide.load = load <16 x half>, <16 x half>* %src, align 4
@@ -134,14 +134,14 @@ entry:
 define void @foo_int32_int8_double(<16 x i32>* %dest, <16 x i8>* readonly %src, i32 %n) {
 ; CHECK-LABEL: foo_int32_int8_double:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.s32 q0, [r1, #4]
-; CHECK-NEXT:    vldrb.s32 q1, [r1]
-; CHECK-NEXT:    vldrb.s32 q2, [r1, #12]
-; CHECK-NEXT:    vldrb.s32 q3, [r1, #8]
-; CHECK-NEXT:    vstrw.32 q1, [r0]
-; CHECK-NEXT:    vstrw.32 q0, [r0, #16]
-; CHECK-NEXT:    vstrw.32 q3, [r0, #32]
-; CHECK-NEXT:    vstrw.32 q2, [r0, #48]
+; CHECK-NEXT:    vldrb.s32 q0, [r1]
+; CHECK-NEXT:    vldrb.s32 q1, [r1, #4]
+; CHECK-NEXT:    vldrb.s32 q2, [r1, #8]
+; CHECK-NEXT:    vldrb.s32 q3, [r1, #12]
+; CHECK-NEXT:    vstrw.32 q1, [r0, #16]
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vstrw.32 q3, [r0, #48]
+; CHECK-NEXT:    vstrw.32 q2, [r0, #32]
 ; CHECK-NEXT:    bx lr
 entry:
   %wide.load = load <16 x i8>, <16 x i8>* %src, align 1
@@ -224,14 +224,14 @@ entry:
 define void @foo_uint32_uint8_double(<16 x i32>* %dest, <16 x i8>* readonly %src, i32 %n) {
 ; CHECK-LABEL: foo_uint32_uint8_double:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.u32 q0, [r1, #4]
-; CHECK-NEXT:    vldrb.u32 q1, [r1]
-; CHECK-NEXT:    vldrb.u32 q2, [r1, #12]
-; CHECK-NEXT:    vldrb.u32 q3, [r1, #8]
-; CHECK-NEXT:    vstrw.32 q1, [r0]
-; CHECK-NEXT:    vstrw.32 q0, [r0, #16]
-; CHECK-NEXT:    vstrw.32 q3, [r0, #32]
-; CHECK-NEXT:    vstrw.32 q2, [r0, #48]
+; CHECK-NEXT:    vldrb.u32 q0, [r1]
+; CHECK-NEXT:    vldrb.u32 q1, [r1, #4]
+; CHECK-NEXT:    vldrb.u32 q2, [r1, #8]
+; CHECK-NEXT:    vldrb.u32 q3, [r1, #12]
+; CHECK-NEXT:    vstrw.32 q1, [r0, #16]
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vstrw.32 q3, [r0, #48]
+; CHECK-NEXT:    vstrw.32 q2, [r0, #32]
 ; CHECK-NEXT:    bx lr
 entry:
   %wide.load = load <16 x i8>, <16 x i8>* %src, align 1
@@ -347,12 +347,12 @@ define <16 x i16>* @foo_uint32_uint16_quad_offset(<16 x i32>* %dest, <16 x i16>*
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.s32 q0, [r1, #32]!
 ; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
-; CHECK-NEXT:    vldrh.s32 q2, [r1, #24]
-; CHECK-NEXT:    vldrh.s32 q3, [r1, #16]
+; CHECK-NEXT:    vldrh.s32 q2, [r1, #16]
+; CHECK-NEXT:    vldrh.s32 q3, [r1, #24]
 ; CHECK-NEXT:    vstrw.32 q0, [r0]
-; CHECK-NEXT:    vstrw.32 q2, [r0, #48]
+; CHECK-NEXT:    vstrw.32 q2, [r0, #32]
 ; CHECK-NEXT:    vstrw.32 q1, [r0, #16]
-; CHECK-NEXT:    vstrw.32 q3, [r0, #32]
+; CHECK-NEXT:    vstrw.32 q3, [r0, #48]
 ; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    bx lr
 entry: