forked from OSchip/llvm-project
[ARM] MVE VCVT lowering for f16->f32 extends
This adds code to lower f16 to f32 fp_exts using MVE VCVT instructions, similar to a recent patch for fp_trunc. Again it goes through the lowering of a BUILD_VECTOR, but is slightly simpler, only having to deal with interleaved indices. It adds a VCVTL node to lower to, similar to VCVTN. Differential Revision: https://reviews.llvm.org/D81339
This commit is contained in:
parent
6673d69226
commit
8532b2ee89
|
@ -1693,6 +1693,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
|
|||
case ARMISD::VQMOVNs: return "ARMISD::VQMOVNs";
|
||||
case ARMISD::VQMOVNu: return "ARMISD::VQMOVNu";
|
||||
case ARMISD::VCVTN: return "ARMISD::VCVTN";
|
||||
case ARMISD::VCVTL: return "ARMISD::VCVTL";
|
||||
case ARMISD::VMULLs: return "ARMISD::VMULLs";
|
||||
case ARMISD::VMULLu: return "ARMISD::VMULLu";
|
||||
case ARMISD::VADDVs: return "ARMISD::VADDVs";
|
||||
|
@ -7244,7 +7245,7 @@ static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top) {
|
|||
return true;
|
||||
}
|
||||
|
||||
// Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extract
|
||||
// Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted
|
||||
// from a pair of inputs. For example:
|
||||
// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
|
||||
// FP_ROUND(EXTRACT_ELT(Y, 0),
|
||||
|
@ -7298,6 +7299,50 @@ static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG,
|
|||
DAG.getConstant(1, dl, MVT::i32));
|
||||
}
|
||||
|
||||
// Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted
|
||||
// from a single input on alternating lanes. For example:
|
||||
// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
|
||||
// FP_ROUND(EXTRACT_ELT(X, 2),
|
||||
// FP_ROUND(EXTRACT_ELT(X, 4), ...)
|
||||
static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG,
|
||||
const ARMSubtarget *ST) {
|
||||
assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
|
||||
if (!ST->hasMVEFloatOps())
|
||||
return SDValue();
|
||||
|
||||
SDLoc dl(BV);
|
||||
EVT VT = BV.getValueType();
|
||||
if (VT != MVT::v4f32)
|
||||
return SDValue();
|
||||
|
||||
// We are looking for a buildvector of fptext elements, where all the
|
||||
// elements are alternating lanes from a single source. For example <0,2,4,6>
|
||||
// or <1,3,5,7>. Check the first two items are valid enough and extract some
|
||||
// info from them (they are checked properly in the loop below).
|
||||
if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND ||
|
||||
BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT)
|
||||
return SDValue();
|
||||
SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
|
||||
int Offset = BV.getOperand(0).getOperand(0).getConstantOperandVal(1);
|
||||
if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1))
|
||||
return SDValue();
|
||||
|
||||
// Check all the values in the BuildVector line up with our expectations.
|
||||
for (unsigned i = 1; i < 4; i++) {
|
||||
auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
|
||||
return Trunc.getOpcode() == ISD::FP_EXTEND &&
|
||||
Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
|
||||
Trunc.getOperand(0).getOperand(0) == Op &&
|
||||
Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
|
||||
};
|
||||
if (!Check(BV.getOperand(i), Op0, 2 * i + Offset))
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0,
|
||||
DAG.getConstant(Offset, dl, MVT::i32));
|
||||
}
|
||||
|
||||
// If N is an integer constant that can be moved into a register in one
|
||||
// instruction, return an SDValue of such a constant (will become a MOV
|
||||
// instruction). Otherwise return null.
|
||||
|
@ -7560,9 +7605,12 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
|
|||
if (SDValue shuffle = ReconstructShuffle(Op, DAG))
|
||||
return shuffle;
|
||||
|
||||
// Attempt to turn a buildvector of scalar fptrunc's back into VCVT's
|
||||
// Attempt to turn a buildvector of scalar fptrunc's or fpext's back into
|
||||
// VCVT's
|
||||
if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget))
|
||||
return VCVT;
|
||||
if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget))
|
||||
return VCVT;
|
||||
|
||||
if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
|
||||
// If we haven't found an efficient lowering, try splitting a 128-bit vector
|
||||
|
|
|
@ -210,6 +210,7 @@ class VectorType;
|
|||
|
||||
// MVE float <> half converts
|
||||
VCVTN, // MVE vcvt f32 -> f16, truncating into either the bottom or top lanes
|
||||
VCVTL, // MVE vcvt f16 -> f32, extending from either the bottom or top lanes
|
||||
|
||||
// Vector multiply long:
|
||||
VMULLs, // ...signed
|
||||
|
|
|
@ -4782,7 +4782,10 @@ class MVE_VCVT_ff<string iname, string suffix, bit op, bit T,
|
|||
let retainsPreviousHalfElement = 1;
|
||||
}
|
||||
|
||||
// Type profile used by the ARMISD::VCVTL node: one vector result, a vector
// source operand and an i32 operand.
def SDTARMVCVTL : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
|
||||
SDTCisVT<2, i32>]>;
|
||||
def MVEvcvtn : SDNode<"ARMISD::VCVTN", SDTARMVMOVNQ>;
|
||||
def MVEvcvtl : SDNode<"ARMISD::VCVTL", SDTARMVCVTL>;
|
||||
|
||||
multiclass MVE_VCVT_f2h_m<string iname, int half> {
|
||||
def "": MVE_VCVT_ff<iname, "f16.f32", 0b0, half,
|
||||
|
@ -4816,6 +4819,9 @@ multiclass MVE_VCVT_h2f_m<string iname, int half> {
|
|||
(v4i1 VCCR:$mask))),
|
||||
(v4f32 (Inst (v8f16 MQPR:$Qm), ARMVCCThen,
|
||||
(v4i1 VCCR:$mask), (v4f32 MQPR:$inactive)))>;
|
||||
|
||||
def : Pat<(v4f32 (MVEvcvtl (v8f16 MQPR:$Qm), (i32 half))),
|
||||
(v4f32 (Inst (v8f16 MQPR:$Qm)))>;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -556,8 +556,6 @@ define void @both_8_I(half* nocapture readonly %x, half* noalias nocapture %y) {
|
|||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r7, lr}
|
||||
; CHECK-NEXT: push {r7, lr}
|
||||
; CHECK-NEXT: .vsave {d8, d9}
|
||||
; CHECK-NEXT: vpush {d8, d9}
|
||||
; CHECK-NEXT: adr r2, .LCPI9_0
|
||||
; CHECK-NEXT: mov.w lr, #128
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r2]
|
||||
|
@ -565,26 +563,15 @@ define void @both_8_I(half* nocapture readonly %x, half* noalias nocapture %y) {
|
|||
; CHECK-NEXT: .LBB9_1: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrh.u16 q1, [r0], #16
|
||||
; CHECK-NEXT: vmovx.f16 s8, s7
|
||||
; CHECK-NEXT: vmovx.f16 s14, s6
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s11, s8
|
||||
; CHECK-NEXT: vmovx.f16 s13, s5
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s10, s14
|
||||
; CHECK-NEXT: vmovx.f16 s12, s4
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s9, s13
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s19, s7
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s18, s6
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s17, s5
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s16, s4
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s8, s12
|
||||
; CHECK-NEXT: vmul.f32 q1, q4, q0
|
||||
; CHECK-NEXT: vcvtb.f32.f16 q2, q1
|
||||
; CHECK-NEXT: vcvtt.f32.f16 q1, q1
|
||||
; CHECK-NEXT: vmul.f32 q2, q2, q0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 q1, q1
|
||||
; CHECK-NEXT: vcvtt.f16.f32 q1, q2
|
||||
; CHECK-NEXT: vstrb.8 q1, [r1], #16
|
||||
; CHECK-NEXT: vmul.f32 q1, q1, q0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 q2, q2
|
||||
; CHECK-NEXT: vcvtt.f16.f32 q2, q1
|
||||
; CHECK-NEXT: vstrb.8 q2, [r1], #16
|
||||
; CHECK-NEXT: le lr, .LBB9_1
|
||||
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
|
||||
; CHECK-NEXT: vpop {d8, d9}
|
||||
; CHECK-NEXT: pop {r7, pc}
|
||||
; CHECK-NEXT: .p2align 4
|
||||
; CHECK-NEXT: @ %bb.3:
|
||||
|
@ -625,8 +612,6 @@ define void @both_16_I(half* nocapture readonly %x, half* noalias nocapture %y)
|
|||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r7, lr}
|
||||
; CHECK-NEXT: push {r7, lr}
|
||||
; CHECK-NEXT: .vsave {d8, d9}
|
||||
; CHECK-NEXT: vpush {d8, d9}
|
||||
; CHECK-NEXT: adr r2, .LCPI10_0
|
||||
; CHECK-NEXT: mov.w lr, #128
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r2]
|
||||
|
@ -634,44 +619,23 @@ define void @both_16_I(half* nocapture readonly %x, half* noalias nocapture %y)
|
|||
; CHECK-NEXT: .LBB10_1: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrh.u16 q1, [r0]
|
||||
; CHECK-NEXT: vmovx.f16 s8, s7
|
||||
; CHECK-NEXT: vmovx.f16 s14, s6
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s11, s8
|
||||
; CHECK-NEXT: vmovx.f16 s13, s5
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s10, s14
|
||||
; CHECK-NEXT: vmovx.f16 s12, s4
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s9, s13
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s19, s7
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s18, s6
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s17, s5
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s16, s4
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s8, s12
|
||||
; CHECK-NEXT: vmul.f32 q1, q4, q0
|
||||
; CHECK-NEXT: vcvtb.f32.f16 q2, q1
|
||||
; CHECK-NEXT: vcvtt.f32.f16 q1, q1
|
||||
; CHECK-NEXT: vmul.f32 q2, q2, q0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 q1, q1
|
||||
; CHECK-NEXT: vcvtt.f16.f32 q1, q2
|
||||
; CHECK-NEXT: vstrh.16 q1, [r1]
|
||||
; CHECK-NEXT: vmul.f32 q1, q1, q0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 q2, q2
|
||||
; CHECK-NEXT: vcvtt.f16.f32 q2, q1
|
||||
; CHECK-NEXT: vldrh.u16 q1, [r0, #16]!
|
||||
; CHECK-NEXT: vmovx.f16 s12, s7
|
||||
; CHECK-NEXT: vmovx.f16 s14, s6
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s19, s12
|
||||
; CHECK-NEXT: vmovx.f16 s8, s5
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s18, s14
|
||||
; CHECK-NEXT: vmovx.f16 s10, s4
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s17, s8
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s16, s10
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s11, s7
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s10, s6
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s9, s5
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s8, s4
|
||||
; CHECK-NEXT: vmul.f32 q1, q4, q0
|
||||
; CHECK-NEXT: vstrh.16 q2, [r1]
|
||||
; CHECK-NEXT: vcvtb.f32.f16 q2, q1
|
||||
; CHECK-NEXT: vcvtt.f32.f16 q1, q1
|
||||
; CHECK-NEXT: vmul.f32 q2, q2, q0
|
||||
; CHECK-NEXT: vmul.f32 q1, q1, q0
|
||||
; CHECK-NEXT: vcvtb.f16.f32 q2, q2
|
||||
; CHECK-NEXT: vcvtt.f16.f32 q2, q1
|
||||
; CHECK-NEXT: vstrb.8 q2, [r1, #16]!
|
||||
; CHECK-NEXT: le lr, .LBB10_1
|
||||
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
|
||||
; CHECK-NEXT: vpop {d8, d9}
|
||||
; CHECK-NEXT: pop {r7, pc}
|
||||
; CHECK-NEXT: .p2align 4
|
||||
; CHECK-NEXT: @ %bb.3:
|
||||
|
|
|
@ -196,11 +196,7 @@ entry:
|
|||
define arm_aapcs_vfpcc <4 x float> @fpext_0246(<8 x half> %src) {
|
||||
; CHECK-LABEL: fpext_0246:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s7, s3
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s6, s2
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s5, s1
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s4, s0
|
||||
; CHECK-NEXT: vmov q0, q1
|
||||
; CHECK-NEXT: vcvtb.f32.f16 q0, q0
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%strided.vec = shufflevector <8 x half> %src, <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
|
||||
|
@ -211,14 +207,7 @@ entry:
|
|||
define arm_aapcs_vfpcc <4 x float> @fpext_1357(<8 x half> %src) {
|
||||
; CHECK-LABEL: fpext_1357:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vmovx.f16 s8, s3
|
||||
; CHECK-NEXT: vmovx.f16 s4, s1
|
||||
; CHECK-NEXT: vmovx.f16 s6, s0
|
||||
; CHECK-NEXT: vmovx.f16 s10, s2
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s3, s8
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s2, s10
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s1, s4
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s0, s6
|
||||
; CHECK-NEXT: vcvtt.f32.f16 q0, q0
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%strided.vec = shufflevector <8 x half> %src, <8 x half> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
|
||||
|
@ -229,16 +218,8 @@ entry:
|
|||
define arm_aapcs_vfpcc <8 x float> @fpext_02468101214(<16 x half> %src) {
|
||||
; CHECK-LABEL: fpext_02468101214:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s11, s3
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s10, s2
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s9, s1
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s8, s0
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s15, s7
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s14, s6
|
||||
; CHECK-NEXT: vmov q0, q2
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s13, s5
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s12, s4
|
||||
; CHECK-NEXT: vmov q1, q3
|
||||
; CHECK-NEXT: vcvtb.f32.f16 q0, q0
|
||||
; CHECK-NEXT: vcvtb.f32.f16 q1, q1
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%strided.vec = shufflevector <16 x half> %src, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
|
||||
|
@ -249,22 +230,8 @@ entry:
|
|||
define arm_aapcs_vfpcc <8 x float> @fpext_13579111315(<16 x half> %src) {
|
||||
; CHECK-LABEL: fpext_13579111315:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vmovx.f16 s14, s6
|
||||
; CHECK-NEXT: vmovx.f16 s8, s5
|
||||
; CHECK-NEXT: vmovx.f16 s5, s3
|
||||
; CHECK-NEXT: vmovx.f16 s10, s4
|
||||
; CHECK-NEXT: vmovx.f16 s12, s7
|
||||
; CHECK-NEXT: vmovx.f16 s4, s1
|
||||
; CHECK-NEXT: vmovx.f16 s6, s0
|
||||
; CHECK-NEXT: vmovx.f16 s7, s2
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s3, s5
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s2, s7
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s1, s4
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s0, s6
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s7, s12
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s6, s14
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s5, s8
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s4, s10
|
||||
; CHECK-NEXT: vcvtt.f32.f16 q0, q0
|
||||
; CHECK-NEXT: vcvtt.f32.f16 q1, q1
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%strided.vec = shufflevector <16 x half> %src, <16 x half> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
|
||||
|
|
|
@ -298,11 +298,8 @@ entry:
|
|||
define arm_aapcs_vfpcc <4 x float> @load_shuffleext_8(<8 x half>* %src) {
|
||||
; CHECK-LABEL: load_shuffleext_8:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r0]
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s3, s7
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s2, s6
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s1, s5
|
||||
; CHECK-NEXT: vcvtb.f32.f16 s0, s4
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0]
|
||||
; CHECK-NEXT: vcvtb.f32.f16 q0, q0
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%wide.load = load <8 x half>, <8 x half>* %src, align 4
|
||||
|
|
Loading…
Reference in New Issue