[AArch64][SVE] Duplicate FP_EXTEND/FP_TRUNC -> LOAD/STORE dag combines

By duplicating these DAG combines we can bypass the legality checks they
perform. This allows us to apply the combines to larger-than-legal fixed
types, and thereby extend the benefits of D114580 to those types as well.
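
As a rough illustration (an assumed, reduced form of the updated tests below,
not their exact bodies), the fcvt_v16f16_v16f32 case converts a fixed-width
vector that is wider than a single 256-bit SVE register:

define void @fcvt_v16f16_v16f32(<16 x half>* %a, <16 x float>* %b) #0 {
  ; Load 16 halves, widen to floats, store. <16 x half> and <16 x float> are
  ; wider than one 256-bit SVE register, so the type must be split.
  %op1 = load <16 x half>, <16 x half>* %a
  %res = fpext <16 x half> %op1 to <16 x float>
  store <16 x float> %res, <16 x float>* %b
  ret void
}

With the duplicated combine firing before legalisation, the fpext of the load
becomes an extending load, so each half of the split type is loaded directly
in unpacked form (ld1sh) and converted with fcvt, rather than being loaded
packed and rearranged with ext/uunpklo first, as the VBITS_EQ_256 check lines
below show.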

Depends on D114580

Differential Revision: https://reviews.llvm.org/D114628
Bradley Smith 2021-11-26 12:01:22 +00:00
parent ddce6e0561
commit fd9069ffce
2 changed files with 67 additions and 36 deletions

@@ -929,6 +929,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
  setTargetDAGCombine(ISD::VECREDUCE_ADD);
  setTargetDAGCombine(ISD::STEP_VECTOR);

  setTargetDAGCombine(ISD::FP_EXTEND);

  setTargetDAGCombine(ISD::GlobalAddress);

  // In case of strict alignment, avoid an excessive number of byte wide stores.
@@ -15967,6 +15969,22 @@ static SDValue performSTORECombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   SelectionDAG &DAG,
                                   const AArch64Subtarget *Subtarget) {
  StoreSDNode *ST = cast<StoreSDNode>(N);
  SDValue Chain = ST->getChain();
  SDValue Value = ST->getValue();
  SDValue Ptr = ST->getBasePtr();

  // If this is an FP_ROUND followed by a store, fold this into a truncating
  // store. We can do this even if this is already a truncstore.
  // We purposefully don't care about legality of the nodes here as we know
  // they can be split down into something legal.
  if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
      Value.getNode()->hasOneUse() && ST->isUnindexed() &&
      Subtarget->useSVEForFixedLengthVectors() &&
      Value.getValueType().isFixedLengthVector())
    return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
                             ST->getMemoryVT(), ST->getMemOperand());

  if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
    return Split;
@@ -17309,6 +17327,37 @@ SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG) {
  return DAG.getBitcast(Ty, Trunc);
}

SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const AArch64Subtarget *Subtarget) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
  if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND)
    return SDValue();

  // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
  // We purposefully don't care about legality of the nodes here as we know
  // they can be split down into something legal.
  if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
      N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
      VT.isFixedLengthVector()) {
    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
    SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
                                     LN0->getChain(), LN0->getBasePtr(),
                                     N0.getValueType(), LN0->getMemOperand());
    DCI.CombineTo(N, ExtLoad);
    DCI.CombineTo(N0.getNode(),
                  DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(),
                              ExtLoad, DAG.getIntPtrConstant(1, SDLoc(N0))),
                  ExtLoad.getValue(1));
    return SDValue(N, 0); // Return N so it doesn't get rechecked!
  }

  return SDValue();
}

SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
@@ -17365,6 +17414,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
    return performSTORECombine(N, DCI, DAG, Subtarget);
  case ISD::VECTOR_SPLICE:
    return performSVESpliceCombine(N, DAG);
  case ISD::FP_EXTEND:
    return performFPExtendCombine(N, DAG, DCI, Subtarget);
  case AArch64ISD::BRCOND:
    return performBRCONDCombine(N, DCI, DAG);
  case AArch64ISD::TBNZ:

@@ -63,17 +63,14 @@ define void @fcvt_v16f16_v16f32(<16 x half>* %a, <16 x float>* %b) #0 {
; Ensure sensible type legalisation.
; VBITS_EQ_256-LABEL: fcvt_v16f16_v16f32:
; VBITS_EQ_256: // %bb.0:
; VBITS_EQ_256-NEXT: ptrue p0.h, vl16
; VBITS_EQ_256-NEXT: mov x8, #8
; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_EQ_256-NEXT: ptrue p0.s, vl8
; VBITS_EQ_256-NEXT: uunpklo z1.s, z0.h
; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_EQ_256-NEXT: uunpklo z0.s, z0.h
; VBITS_EQ_256-NEXT: fcvt z1.s, p0/m, z1.h
; VBITS_EQ_256-NEXT: ld1sh { z0.s }, p0/z, [x0, x8, lsl #1]
; VBITS_EQ_256-NEXT: ld1sh { z1.s }, p0/z, [x0]
; VBITS_EQ_256-NEXT: fcvt z0.s, p0/m, z0.h
; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1]
; VBITS_EQ_256-NEXT: fcvt z1.s, p0/m, z1.h
; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1]
; VBITS_EQ_256-NEXT: ret
;
; VBITS_GE_512-LABEL: fcvt_v16f16_v16f32:
@@ -164,18 +161,14 @@ define void @fcvt_v4f16_v4f64(<4 x half>* %a, <4 x double>* %b) #0 {
define void @fcvt_v8f16_v8f64(<8 x half>* %a, <8 x double>* %b) #0 {
; VBITS_EQ_256-LABEL: fcvt_v8f16_v8f64:
; VBITS_EQ_256: // %bb.0:
; VBITS_EQ_256-NEXT: ldr q0, [x0]
; VBITS_EQ_256-NEXT: mov x8, #4
; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
; VBITS_EQ_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; VBITS_EQ_256-NEXT: uunpklo z0.s, z0.h
; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s
; VBITS_EQ_256-NEXT: ld1sh { z0.d }, p0/z, [x0, x8, lsl #1]
; VBITS_EQ_256-NEXT: ld1sh { z1.d }, p0/z, [x0]
; VBITS_EQ_256-NEXT: fcvt z0.d, p0/m, z0.h
; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_EQ_256-NEXT: uunpklo z1.s, z1.h
; VBITS_EQ_256-NEXT: uunpklo z1.d, z1.s
; VBITS_EQ_256-NEXT: fcvt z1.d, p0/m, z1.h
; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]
; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1]
; VBITS_EQ_256-NEXT: ret
;
; VBITS_GE_512-LABEL: fcvt_v8f16_v8f64:
@@ -263,17 +256,14 @@ define void @fcvt_v8f32_v8f64(<8 x float>* %a, <8 x double>* %b) #0 {
; Ensure sensible type legalisation.
; VBITS_EQ_256-LABEL: fcvt_v8f32_v8f64:
; VBITS_EQ_256: // %bb.0:
; VBITS_EQ_256-NEXT: ptrue p0.s, vl8
; VBITS_EQ_256-NEXT: mov x8, #4
; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
; VBITS_EQ_256-NEXT: uunpklo z1.d, z0.s
; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s
; VBITS_EQ_256-NEXT: fcvt z1.d, p0/m, z1.s
; VBITS_EQ_256-NEXT: ld1sw { z0.d }, p0/z, [x0, x8, lsl #2]
; VBITS_EQ_256-NEXT: ld1sw { z1.d }, p0/z, [x0]
; VBITS_EQ_256-NEXT: fcvt z0.d, p0/m, z0.s
; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1]
; VBITS_EQ_256-NEXT: fcvt z1.d, p0/m, z1.s
; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1]
; VBITS_EQ_256-NEXT: ret
;
; VBITS_GE_512-LABEL: fcvt_v8f32_v8f64:
@@ -366,15 +356,10 @@ define void @fcvt_v16f32_v16f16(<16 x float>* %a, <16 x half>* %b) #0 {
; VBITS_EQ_256-NEXT: ptrue p0.s, vl8
; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_EQ_256-NEXT: ptrue p0.s
; VBITS_EQ_256-NEXT: fcvt z0.h, p0/m, z0.s
; VBITS_EQ_256-NEXT: fcvt z1.h, p0/m, z1.s
; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h
; VBITS_EQ_256-NEXT: ptrue p0.h, vl8
; VBITS_EQ_256-NEXT: splice z1.h, p0, z1.h, z0.h
; VBITS_EQ_256-NEXT: ptrue p0.h, vl16
; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x1]
; VBITS_EQ_256-NEXT: st1h { z0.s }, p0, [x1, x8, lsl #1]
; VBITS_EQ_256-NEXT: st1h { z1.s }, p0, [x1]
; VBITS_EQ_256-NEXT: ret
;
; VBITS_GE_512-LABEL: fcvt_v16f32_v16f16:
@@ -573,15 +558,10 @@ define void @fcvt_v8f64_v8f32(<8 x double>* %a, <8 x float>* %b) #0 {
; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_EQ_256-NEXT: ptrue p0.d
; VBITS_EQ_256-NEXT: fcvt z0.s, p0/m, z0.d
; VBITS_EQ_256-NEXT: fcvt z1.s, p0/m, z1.d
; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s
; VBITS_EQ_256-NEXT: ptrue p0.s, vl4
; VBITS_EQ_256-NEXT: splice z1.s, p0, z1.s, z0.s
; VBITS_EQ_256-NEXT: ptrue p0.s, vl8
; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1]
; VBITS_EQ_256-NEXT: st1w { z0.d }, p0, [x1, x8, lsl #2]
; VBITS_EQ_256-NEXT: st1w { z1.d }, p0, [x1]
; VBITS_EQ_256-NEXT: ret
;
; VBITS_GE_512-LABEL: fcvt_v8f64_v8f32: