[ARM] Split FPExt loads

This extends PerformSplittingToWideningLoad to handle FP_Ext in addition
to the sign and zero extends it already handled. It uses an integer
extending load followed by a VCVTL on the bottom lanes to efficiently
perform an fpext on a smaller-than-legal type.
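
For illustration (this is the new output of the load_ext_4 test updated
below), an fpext of a loaded <4 x half> to <4 x float> can now be selected
as a single widening integer load plus a bottom-lane convert, instead of a
scalar load and four individual vcvtb's:

    vldrh.u32     q0, [r0]   @ load 4 x i16, zero-extending into 32-bit lanes
    vcvtb.f32.f16 q0, q0     @ convert the bottom f16 of each lane to f32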

The existing code had to be rewritten a little: rather than just splitting
the node in two and letting legalization handle it from there, it now
splits the load directly into legal chunks.
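
As an example of what "legal chunks" means here, a zero-extending load
from <16 x i8> to <16 x i32> is now split straight into four legal
v4i8 -> v4i32 extending loads at ascending byte offsets (this is the new
output of the foo_uint32_uint8_double test below, with the source pointer
in r1):

    vldrb.u32 q0, [r1]        @ lanes 0-3, each i8 zero-extended to i32
    vldrb.u32 q1, [r1, #4]    @ lanes 4-7
    vldrb.u32 q2, [r1, #8]    @ lanes 8-11
    vldrb.u32 q3, [r1, #12]   @ lanes 12-15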

Differential Revision: https://reviews.llvm.org/D81340
David Green 2020-06-25 21:02:02 +01:00
parent 06be4bb5e6
commit d79b57b8bb
4 changed files with 136 additions and 220 deletions

@@ -967,6 +967,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::UMIN);
setTargetDAGCombine(ISD::SMAX);
setTargetDAGCombine(ISD::UMAX);
setTargetDAGCombine(ISD::FP_EXTEND);
}
if (!Subtarget->hasFP64()) {
@@ -15062,9 +15063,10 @@ static SDValue PerformShiftCombine(SDNode *N,
return SDValue();
}
// Look for a sign/zero extend of a larger than legal load. This can be split
// into two extending loads, which are simpler to deal with than an arbitrary
// sign extend.
// Look for a sign/zero/fp extend of a larger than legal load. This can be
// split into multiple extending loads, which are simpler to deal with than an
// arbitrary extend. For fp extends we use an integer extending load and a VCVTL
// to convert the type to an f32.
static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
if (N0.getOpcode() != ISD::LOAD)
@@ -15086,12 +15088,15 @@ static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
NumElements = 4;
if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
NumElements = 8;
if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16)
NumElements = 4;
if (NumElements == 0 ||
FromVT.getVectorNumElements() == NumElements ||
(FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) ||
FromVT.getVectorNumElements() % NumElements != 0 ||
!isPowerOf2_32(NumElements))
return SDValue();
LLVMContext &C = *DAG.getContext();
SDLoc DL(LD);
// Details about the old load
SDValue Ch = LD->getChain();
@@ -15103,28 +15108,43 @@ static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
ISD::LoadExtType NewExtType =
N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
EVT NewFromVT = FromVT.getHalfNumVectorElementsVT(*DAG.getContext());
EVT NewToVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
unsigned NewOffset = NewFromVT.getSizeInBits() / 8;
SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset);
EVT NewFromVT = EVT::getVectorVT(
C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
EVT NewToVT = EVT::getVectorVT(
C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
// Split the load in half, each side of which is extended separately. This
// is good enough, as legalisation will take it from there. They are either
// already legal or they will be split further into something that is
// legal.
SDValue NewLoad1 = DAG.getLoad(
ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, BasePtr, Offset,
LD->getPointerInfo(), NewFromVT, Alignment.value(), MMOFlags, AAInfo);
SDValue NewLoad2 =
DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
Alignment.value(), MMOFlags, AAInfo);
SmallVector<SDValue, 4> Loads;
SmallVector<SDValue, 4> Chains;
for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset);
SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
SDValue(NewLoad1.getNode(), 1),
SDValue(NewLoad2.getNode(), 1));
SDValue NewLoad =
DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
Alignment.value(), MMOFlags, AAInfo);
Loads.push_back(NewLoad);
Chains.push_back(SDValue(NewLoad.getNode(), 1));
}
// Float truncs need to be extended with VCVTB's into their floating point types.
if (FromEltVT == MVT::f16) {
SmallVector<SDValue, 4> Extends;
for (unsigned i = 0; i < Loads.size(); i++) {
SDValue LoadBC =
DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]);
SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC,
DAG.getConstant(0, DL, MVT::i32));
Extends.push_back(FPExt);
}
Loads = Extends;
}
SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, NewLoad1, NewLoad2);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Loads);
}
/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
@@ -15172,6 +15192,15 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG,
const ARMSubtarget *ST) {
if (ST->hasMVEFloatOps())
if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
return NewLoad;
return SDValue();
}
/// PerformMinMaxCombine - Target-specific DAG combining for creating truncating
/// saturates.
static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG,
@@ -15830,6 +15859,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
return PerformExtendCombine(N, DCI.DAG, Subtarget);
case ISD::FP_EXTEND:
return PerformFPExtendCombine(N, DCI.DAG, Subtarget);
case ISD::SMIN:
case ISD::UMIN:
case ISD::SMAX:

@@ -166,18 +166,9 @@ define void @from_4(half* nocapture readonly %x, float* noalias nocapture %y) {
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB3_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: ldr r3, [r0, #4]
; CHECK-NEXT: adds r0, #8
; CHECK-NEXT: vmov.32 q1[0], r2
; CHECK-NEXT: vmov.32 q1[1], r3
; CHECK-NEXT: vmovx.f16 s10, s5
; CHECK-NEXT: vmovx.f16 s8, s4
; CHECK-NEXT: vcvtb.f32.f16 s15, s10
; CHECK-NEXT: vcvtb.f32.f16 s14, s5
; CHECK-NEXT: vcvtb.f32.f16 s13, s8
; CHECK-NEXT: vcvtb.f32.f16 s12, s4
; CHECK-NEXT: vmul.f32 q1, q3, q0
; CHECK-NEXT: vldrh.u32 q1, [r0], #8
; CHECK-NEXT: vcvtb.f32.f16 q1, q1
; CHECK-NEXT: vmul.f32 q1, q1, q0
; CHECK-NEXT: vstrb.8 q1, [r1], #16
; CHECK-NEXT: le lr, .LBB3_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
@@ -215,34 +206,22 @@ define void @from_8(half* nocapture readonly %x, float* noalias nocapture %y) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: adr r2, .LCPI4_0
; CHECK-NEXT: mov.w lr, #128
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB4_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.u16 q1, [r0], #16
; CHECK-NEXT: vmovx.f16 s8, s5
; CHECK-NEXT: vmovx.f16 s13, s7
; CHECK-NEXT: vcvtb.f32.f16 s11, s8
; CHECK-NEXT: vmovx.f16 s14, s6
; CHECK-NEXT: vcvtb.f32.f16 s10, s5
; CHECK-NEXT: vcvtb.f32.f16 s19, s13
; CHECK-NEXT: vcvtb.f32.f16 s18, s7
; CHECK-NEXT: vmovx.f16 s12, s4
; CHECK-NEXT: vcvtb.f32.f16 s17, s14
; CHECK-NEXT: vcvtb.f32.f16 s16, s6
; CHECK-NEXT: vcvtb.f32.f16 s9, s12
; CHECK-NEXT: vcvtb.f32.f16 s8, s4
; CHECK-NEXT: vmul.f32 q1, q4, q0
; CHECK-NEXT: vldrh.u32 q1, [r0, #8]
; CHECK-NEXT: vcvtb.f32.f16 q1, q1
; CHECK-NEXT: vmul.f32 q1, q1, q0
; CHECK-NEXT: vstrw.32 q1, [r1, #16]
; CHECK-NEXT: vmul.f32 q1, q2, q0
; CHECK-NEXT: vldrh.u32 q1, [r0], #16
; CHECK-NEXT: vcvtb.f32.f16 q1, q1
; CHECK-NEXT: vmul.f32 q1, q1, q0
; CHECK-NEXT: vstrw.32 q1, [r1], #32
; CHECK-NEXT: le lr, .LBB4_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
@@ -277,51 +256,30 @@ define void @from_16(half* nocapture readonly %x, float* noalias nocapture %y) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: adr r2, .LCPI5_0
; CHECK-NEXT: mov.w lr, #64
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB5_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.u16 q1, [r0], #32
; CHECK-NEXT: vmovx.f16 s12, s5
; CHECK-NEXT: vldrh.u16 q2, [r0, #-16]
; CHECK-NEXT: vcvtb.f32.f16 s15, s12
; CHECK-NEXT: vmovx.f16 s18, s4
; CHECK-NEXT: vcvtb.f32.f16 s14, s5
; CHECK-NEXT: vmovx.f16 s16, s7
; CHECK-NEXT: vcvtb.f32.f16 s13, s18
; CHECK-NEXT: vmovx.f16 s20, s9
; CHECK-NEXT: vcvtb.f32.f16 s12, s4
; CHECK-NEXT: vcvtb.f32.f16 s19, s16
; CHECK-NEXT: vmovx.f16 s22, s6
; CHECK-NEXT: vcvtb.f32.f16 s18, s7
; CHECK-NEXT: vcvtb.f32.f16 s17, s22
; CHECK-NEXT: vcvtb.f32.f16 s23, s20
; CHECK-NEXT: vmovx.f16 s28, s11
; CHECK-NEXT: vcvtb.f32.f16 s22, s9
; CHECK-NEXT: vcvtb.f32.f16 s31, s28
; CHECK-NEXT: vmovx.f16 s26, s10
; CHECK-NEXT: vcvtb.f32.f16 s30, s11
; CHECK-NEXT: vmovx.f16 s24, s8
; CHECK-NEXT: vcvtb.f32.f16 s29, s26
; CHECK-NEXT: vcvtb.f32.f16 s28, s10
; CHECK-NEXT: vcvtb.f32.f16 s21, s24
; CHECK-NEXT: vcvtb.f32.f16 s20, s8
; CHECK-NEXT: vcvtb.f32.f16 s16, s6
; CHECK-NEXT: vmul.f32 q1, q7, q0
; CHECK-NEXT: vldrh.u32 q1, [r0, #24]
; CHECK-NEXT: vcvtb.f32.f16 q1, q1
; CHECK-NEXT: vmul.f32 q1, q1, q0
; CHECK-NEXT: vstrw.32 q1, [r1, #48]
; CHECK-NEXT: vmul.f32 q1, q5, q0
; CHECK-NEXT: vldrh.u32 q1, [r0, #16]
; CHECK-NEXT: vcvtb.f32.f16 q1, q1
; CHECK-NEXT: vmul.f32 q1, q1, q0
; CHECK-NEXT: vstrw.32 q1, [r1, #32]
; CHECK-NEXT: vmul.f32 q1, q4, q0
; CHECK-NEXT: vldrh.u32 q1, [r0, #8]
; CHECK-NEXT: vcvtb.f32.f16 q1, q1
; CHECK-NEXT: vmul.f32 q1, q1, q0
; CHECK-NEXT: vstrw.32 q1, [r1, #16]
; CHECK-NEXT: vmul.f32 q1, q3, q0
; CHECK-NEXT: vldrh.u32 q1, [r0], #32
; CHECK-NEXT: vcvtb.f32.f16 q1, q1
; CHECK-NEXT: vmul.f32 q1, q1, q0
; CHECK-NEXT: vstrw.32 q1, [r1], #64
; CHECK-NEXT: le lr, .LBB5_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
@@ -362,18 +320,9 @@ define void @both_4(half* nocapture readonly %x, half* noalias nocapture %y) {
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB6_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: ldr r3, [r0, #4]
; CHECK-NEXT: adds r0, #8
; CHECK-NEXT: vmov.32 q1[0], r2
; CHECK-NEXT: vmov.32 q1[1], r3
; CHECK-NEXT: vmovx.f16 s10, s5
; CHECK-NEXT: vmovx.f16 s8, s4
; CHECK-NEXT: vcvtb.f32.f16 s15, s10
; CHECK-NEXT: vcvtb.f32.f16 s14, s5
; CHECK-NEXT: vcvtb.f32.f16 s13, s8
; CHECK-NEXT: vcvtb.f32.f16 s12, s4
; CHECK-NEXT: vmul.f32 q1, q3, q0
; CHECK-NEXT: vldrh.u32 q1, [r0], #8
; CHECK-NEXT: vcvtb.f32.f16 q1, q1
; CHECK-NEXT: vmul.f32 q1, q1, q0
; CHECK-NEXT: vcvtb.f16.f32 q1, q1
; CHECK-NEXT: vstrh.32 q1, [r1], #8
; CHECK-NEXT: le lr, .LBB6_1
@@ -419,23 +368,14 @@ define void @both_8(half* nocapture readonly %x, half* noalias nocapture %y) {
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB7_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.u16 q1, [r0], #16
; CHECK-NEXT: vmovx.f16 s10, s7
; CHECK-NEXT: vmovx.f16 s8, s6
; CHECK-NEXT: vcvtb.f32.f16 s15, s10
; CHECK-NEXT: vcvtb.f32.f16 s14, s7
; CHECK-NEXT: vcvtb.f32.f16 s13, s8
; CHECK-NEXT: vcvtb.f32.f16 s12, s6
; CHECK-NEXT: vmul.f32 q2, q3, q0
; CHECK-NEXT: vcvtb.f16.f32 q2, q2
; CHECK-NEXT: vstrh.32 q2, [r1, #8]
; CHECK-NEXT: vmovx.f16 s10, s5
; CHECK-NEXT: vcvtb.f32.f16 s15, s10
; CHECK-NEXT: vmovx.f16 s8, s4
; CHECK-NEXT: vcvtb.f32.f16 s14, s5
; CHECK-NEXT: vcvtb.f32.f16 s13, s8
; CHECK-NEXT: vcvtb.f32.f16 s12, s4
; CHECK-NEXT: vmul.f32 q1, q3, q0
; CHECK-NEXT: vldrh.u32 q1, [r0, #8]
; CHECK-NEXT: vcvtb.f32.f16 q1, q1
; CHECK-NEXT: vmul.f32 q1, q1, q0
; CHECK-NEXT: vcvtb.f16.f32 q1, q1
; CHECK-NEXT: vstrh.32 q1, [r1, #8]
; CHECK-NEXT: vldrh.u32 q1, [r0], #16
; CHECK-NEXT: vcvtb.f32.f16 q1, q1
; CHECK-NEXT: vmul.f32 q1, q1, q0
; CHECK-NEXT: vcvtb.f16.f32 q1, q1
; CHECK-NEXT: vstrh.32 q1, [r1], #16
; CHECK-NEXT: le lr, .LBB7_1
@@ -481,42 +421,24 @@ define void @both_16(half* nocapture readonly %x, half* noalias nocapture %y) {
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB8_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.u16 q1, [r0, #16]
; CHECK-NEXT: vmovx.f16 s10, s7
; CHECK-NEXT: vmovx.f16 s8, s6
; CHECK-NEXT: vcvtb.f32.f16 s15, s10
; CHECK-NEXT: vcvtb.f32.f16 s14, s7
; CHECK-NEXT: vcvtb.f32.f16 s13, s8
; CHECK-NEXT: vcvtb.f32.f16 s12, s6
; CHECK-NEXT: vmul.f32 q2, q3, q0
; CHECK-NEXT: vcvtb.f16.f32 q2, q2
; CHECK-NEXT: vstrh.32 q2, [r1, #24]
; CHECK-NEXT: vmovx.f16 s10, s5
; CHECK-NEXT: vcvtb.f32.f16 s15, s10
; CHECK-NEXT: vmovx.f16 s8, s4
; CHECK-NEXT: vcvtb.f32.f16 s14, s5
; CHECK-NEXT: vcvtb.f32.f16 s13, s8
; CHECK-NEXT: vcvtb.f32.f16 s12, s4
; CHECK-NEXT: vmul.f32 q1, q3, q0
; CHECK-NEXT: vldrh.u32 q1, [r0, #24]
; CHECK-NEXT: vcvtb.f32.f16 q1, q1
; CHECK-NEXT: vmul.f32 q1, q1, q0
; CHECK-NEXT: vcvtb.f16.f32 q1, q1
; CHECK-NEXT: vstrh.32 q1, [r1, #24]
; CHECK-NEXT: vldrh.u32 q1, [r0, #16]
; CHECK-NEXT: vcvtb.f32.f16 q1, q1
; CHECK-NEXT: vmul.f32 q1, q1, q0
; CHECK-NEXT: vcvtb.f16.f32 q1, q1
; CHECK-NEXT: vstrh.32 q1, [r1, #16]
; CHECK-NEXT: vldrh.u16 q1, [r0], #32
; CHECK-NEXT: vmovx.f16 s10, s7
; CHECK-NEXT: vmovx.f16 s8, s6
; CHECK-NEXT: vcvtb.f32.f16 s15, s10
; CHECK-NEXT: vcvtb.f32.f16 s14, s7
; CHECK-NEXT: vcvtb.f32.f16 s13, s8
; CHECK-NEXT: vcvtb.f32.f16 s12, s6
; CHECK-NEXT: vmul.f32 q2, q3, q0
; CHECK-NEXT: vcvtb.f16.f32 q2, q2
; CHECK-NEXT: vstrh.32 q2, [r1, #8]
; CHECK-NEXT: vmovx.f16 s10, s5
; CHECK-NEXT: vcvtb.f32.f16 s15, s10
; CHECK-NEXT: vmovx.f16 s8, s4
; CHECK-NEXT: vcvtb.f32.f16 s14, s5
; CHECK-NEXT: vcvtb.f32.f16 s13, s8
; CHECK-NEXT: vcvtb.f32.f16 s12, s4
; CHECK-NEXT: vmul.f32 q1, q3, q0
; CHECK-NEXT: vldrh.u32 q1, [r0, #8]
; CHECK-NEXT: vcvtb.f32.f16 q1, q1
; CHECK-NEXT: vmul.f32 q1, q1, q0
; CHECK-NEXT: vcvtb.f16.f32 q1, q1
; CHECK-NEXT: vstrh.32 q1, [r1, #8]
; CHECK-NEXT: vldrh.u32 q1, [r0], #32
; CHECK-NEXT: vcvtb.f32.f16 q1, q1
; CHECK-NEXT: vmul.f32 q1, q1, q0
; CHECK-NEXT: vcvtb.f16.f32 q1, q1
; CHECK-NEXT: vstrh.32 q1, [r1], #32
; CHECK-NEXT: le lr, .LBB8_1

@@ -217,15 +217,8 @@ entry:
define arm_aapcs_vfpcc <4 x float> @load_ext_4(<4 x half>* %src) {
; CHECK-LABEL: load_ext_4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrd r1, r0, [r0]
; CHECK-NEXT: vmov.32 q1[0], r1
; CHECK-NEXT: vmov.32 q1[1], r0
; CHECK-NEXT: vmovx.f16 s0, s5
; CHECK-NEXT: vmovx.f16 s8, s4
; CHECK-NEXT: vcvtb.f32.f16 s3, s0
; CHECK-NEXT: vcvtb.f32.f16 s2, s5
; CHECK-NEXT: vcvtb.f32.f16 s1, s8
; CHECK-NEXT: vcvtb.f32.f16 s0, s4
; CHECK-NEXT: vldrh.u32 q0, [r0]
; CHECK-NEXT: vcvtb.f32.f16 q0, q0
; CHECK-NEXT: bx lr
entry:
%wide.load = load <4 x half>, <4 x half>* %src, align 4
@@ -236,19 +229,10 @@ entry:
define arm_aapcs_vfpcc <8 x float> @load_ext_8(<8 x half>* %src) {
; CHECK-LABEL: load_ext_8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q2, [r0]
; CHECK-NEXT: vmovx.f16 s0, s9
; CHECK-NEXT: vmovx.f16 s6, s8
; CHECK-NEXT: vcvtb.f32.f16 s3, s0
; CHECK-NEXT: vmovx.f16 s4, s11
; CHECK-NEXT: vcvtb.f32.f16 s2, s9
; CHECK-NEXT: vmovx.f16 s12, s10
; CHECK-NEXT: vcvtb.f32.f16 s1, s6
; CHECK-NEXT: vcvtb.f32.f16 s0, s8
; CHECK-NEXT: vcvtb.f32.f16 s7, s4
; CHECK-NEXT: vcvtb.f32.f16 s6, s11
; CHECK-NEXT: vcvtb.f32.f16 s5, s12
; CHECK-NEXT: vcvtb.f32.f16 s4, s10
; CHECK-NEXT: vldrh.u32 q0, [r0]
; CHECK-NEXT: vldrh.u32 q1, [r0, #8]
; CHECK-NEXT: vcvtb.f32.f16 q0, q0
; CHECK-NEXT: vcvtb.f32.f16 q1, q1
; CHECK-NEXT: bx lr
entry:
%wide.load = load <8 x half>, <8 x half>* %src, align 4
@@ -259,35 +243,14 @@ entry:
define arm_aapcs_vfpcc <16 x float> @load_ext_16(<16 x half>* %src) {
; CHECK-LABEL: load_ext_16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10}
; CHECK-NEXT: vpush {d8, d9, d10}
; CHECK-NEXT: vldrw.u32 q2, [r0]
; CHECK-NEXT: vldrw.u32 q4, [r0, #16]
; CHECK-NEXT: vmovx.f16 s0, s9
; CHECK-NEXT: vmovx.f16 s6, s8
; CHECK-NEXT: vcvtb.f32.f16 s3, s0
; CHECK-NEXT: vmovx.f16 s4, s11
; CHECK-NEXT: vcvtb.f32.f16 s2, s9
; CHECK-NEXT: vmovx.f16 s15, s10
; CHECK-NEXT: vcvtb.f32.f16 s1, s6
; CHECK-NEXT: vmovx.f16 s13, s17
; CHECK-NEXT: vcvtb.f32.f16 s0, s8
; CHECK-NEXT: vcvtb.f32.f16 s7, s4
; CHECK-NEXT: vcvtb.f32.f16 s6, s11
; CHECK-NEXT: vmovx.f16 s14, s16
; CHECK-NEXT: vcvtb.f32.f16 s5, s15
; CHECK-NEXT: vmovx.f16 s12, s19
; CHECK-NEXT: vcvtb.f32.f16 s4, s10
; CHECK-NEXT: vcvtb.f32.f16 s11, s13
; CHECK-NEXT: vcvtb.f32.f16 s10, s17
; CHECK-NEXT: vmovx.f16 s20, s18
; CHECK-NEXT: vcvtb.f32.f16 s9, s14
; CHECK-NEXT: vcvtb.f32.f16 s8, s16
; CHECK-NEXT: vcvtb.f32.f16 s15, s12
; CHECK-NEXT: vcvtb.f32.f16 s14, s19
; CHECK-NEXT: vcvtb.f32.f16 s13, s20
; CHECK-NEXT: vcvtb.f32.f16 s12, s18
; CHECK-NEXT: vpop {d8, d9, d10}
; CHECK-NEXT: vldrh.u32 q0, [r0]
; CHECK-NEXT: vldrh.u32 q1, [r0, #8]
; CHECK-NEXT: vldrh.u32 q2, [r0, #16]
; CHECK-NEXT: vldrh.u32 q3, [r0, #24]
; CHECK-NEXT: vcvtb.f32.f16 q0, q0
; CHECK-NEXT: vcvtb.f32.f16 q1, q1
; CHECK-NEXT: vcvtb.f32.f16 q2, q2
; CHECK-NEXT: vcvtb.f32.f16 q3, q3
; CHECK-NEXT: bx lr
entry:
%wide.load = load <16 x half>, <16 x half>* %src, align 4

@@ -134,14 +134,14 @@ entry:
define void @foo_int32_int8_double(<16 x i32>* %dest, <16 x i8>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_int32_int8_double:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.s32 q0, [r1, #4]
; CHECK-NEXT: vldrb.s32 q1, [r1]
; CHECK-NEXT: vldrb.s32 q2, [r1, #12]
; CHECK-NEXT: vldrb.s32 q3, [r1, #8]
; CHECK-NEXT: vstrw.32 q1, [r0]
; CHECK-NEXT: vstrw.32 q0, [r0, #16]
; CHECK-NEXT: vstrw.32 q3, [r0, #32]
; CHECK-NEXT: vstrw.32 q2, [r0, #48]
; CHECK-NEXT: vldrb.s32 q0, [r1]
; CHECK-NEXT: vldrb.s32 q1, [r1, #4]
; CHECK-NEXT: vldrb.s32 q2, [r1, #8]
; CHECK-NEXT: vldrb.s32 q3, [r1, #12]
; CHECK-NEXT: vstrw.32 q1, [r0, #16]
; CHECK-NEXT: vstrw.32 q0, [r0]
; CHECK-NEXT: vstrw.32 q3, [r0, #48]
; CHECK-NEXT: vstrw.32 q2, [r0, #32]
; CHECK-NEXT: bx lr
entry:
%wide.load = load <16 x i8>, <16 x i8>* %src, align 1
@@ -224,14 +224,14 @@ entry:
define void @foo_uint32_uint8_double(<16 x i32>* %dest, <16 x i8>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_uint32_uint8_double:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u32 q0, [r1, #4]
; CHECK-NEXT: vldrb.u32 q1, [r1]
; CHECK-NEXT: vldrb.u32 q2, [r1, #12]
; CHECK-NEXT: vldrb.u32 q3, [r1, #8]
; CHECK-NEXT: vstrw.32 q1, [r0]
; CHECK-NEXT: vstrw.32 q0, [r0, #16]
; CHECK-NEXT: vstrw.32 q3, [r0, #32]
; CHECK-NEXT: vstrw.32 q2, [r0, #48]
; CHECK-NEXT: vldrb.u32 q0, [r1]
; CHECK-NEXT: vldrb.u32 q1, [r1, #4]
; CHECK-NEXT: vldrb.u32 q2, [r1, #8]
; CHECK-NEXT: vldrb.u32 q3, [r1, #12]
; CHECK-NEXT: vstrw.32 q1, [r0, #16]
; CHECK-NEXT: vstrw.32 q0, [r0]
; CHECK-NEXT: vstrw.32 q3, [r0, #48]
; CHECK-NEXT: vstrw.32 q2, [r0, #32]
; CHECK-NEXT: bx lr
entry:
%wide.load = load <16 x i8>, <16 x i8>* %src, align 1
@@ -347,12 +347,12 @@ define <16 x i16>* @foo_uint32_uint16_quad_offset(<16 x i32>* %dest, <16 x i16>*
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.s32 q0, [r1, #32]!
; CHECK-NEXT: vldrh.s32 q1, [r1, #8]
; CHECK-NEXT: vldrh.s32 q2, [r1, #24]
; CHECK-NEXT: vldrh.s32 q3, [r1, #16]
; CHECK-NEXT: vldrh.s32 q2, [r1, #16]
; CHECK-NEXT: vldrh.s32 q3, [r1, #24]
; CHECK-NEXT: vstrw.32 q0, [r0]
; CHECK-NEXT: vstrw.32 q2, [r0, #48]
; CHECK-NEXT: vstrw.32 q2, [r0, #32]
; CHECK-NEXT: vstrw.32 q1, [r0, #16]
; CHECK-NEXT: vstrw.32 q3, [r0, #32]
; CHECK-NEXT: vstrw.32 q3, [r0, #48]
; CHECK-NEXT: mov r0, r1
; CHECK-NEXT: bx lr
entry: