[RISCV] Match splatted load to scalar load + splat. Form strided load during isel.

This modifies my previous patch to push the strided load formation
to isel. This gives us the opportunity to fold the splat into a .vx
operation first. Using a scalar register and a .vx operation reduces
vector register pressure, which can be important for larger LMULs.

If we can't fold the splat into a .vx operation, it can still make
sense to use a strided load to free up the vector ALU for actual
arithmetic rather than tying it up with a vmv.v.x.

Reviewed By: khchen

Differential Revision: https://reviews.llvm.org/D101138
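As a rough illustration of the preference order described above (a standalone C++ sketch with invented names, not the SelectionDAG code in the diffs below): fold the splat into a .vx/.vf use when possible, since that keeps the splatted value in a scalar register; otherwise fall back to a zero-stride vlse so the vector ALU is not spent executing a vmv.v.x.

#include <cstdio>

// Invented, simplified model of the lowering choice; not an LLVM API.
enum class SplatOfLoadLowering {
  FoldIntoVX,  // keep the scalar load, e.g. vadd.vx vd, vs2, rs1
  StridedLoad  // vlse32.v/vlse64.v vd, (rs1), zero  (stride register is x0)
};

// 'allUsesTakeScalarOperand' stands in for isel being able to fold the splat
// into every use as a .vx/.vf instruction.
SplatOfLoadLowering choose(bool allUsesTakeScalarOperand) {
  return allUsesTakeScalarOperand ? SplatOfLoadLowering::FoldIntoVX
                                  : SplatOfLoadLowering::StridedLoad;
}

int main() {
  printf("%d\n", static_cast<int>(choose(true)));  // 0 -> fold into .vx
  printf("%d\n", static_cast<int>(choose(false))); // 1 -> x0-strided load
}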
Craig Topper 2021-04-26 13:23:22 -07:00
parent b81244fa4f
commit e2cd92cb9b
5 changed files with 98 additions and 37 deletions

@@ -1138,6 +1138,44 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
     ReplaceNode(Node, Extract.getNode());
     return;
   }
+  case RISCVISD::VMV_V_X_VL:
+  case RISCVISD::VFMV_V_F_VL: {
+    // Try to match splat of a scalar load to a strided load with stride of x0.
+    SDValue Src = Node->getOperand(0);
+    auto *Ld = dyn_cast<LoadSDNode>(Src);
+    if (!Ld)
+      break;
+    EVT MemVT = Ld->getMemoryVT();
+    // The memory VT should be the same size as the element type.
+    if (MemVT.getStoreSize() != VT.getVectorElementType().getStoreSize())
+      break;
+    if (!IsProfitableToFold(Src, Node, Node) ||
+        !IsLegalToFold(Src, Node, Node, TM.getOptLevel()))
+      break;
+    SDValue VL;
+    selectVLOp(Node->getOperand(1), VL);
+    unsigned ScalarSize = VT.getScalarSizeInBits();
+    SDValue SEW = CurDAG->getTargetConstant(ScalarSize, DL, XLenVT);
+    SDValue Operands[] = {Ld->getBasePtr(),
+                          CurDAG->getRegister(RISCV::X0, XLenVT), VL, SEW,
+                          Ld->getChain()};
+    RISCVVLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
+    const RISCV::VLEPseudo *P = RISCV::getVLEPseudo(
+        /*IsMasked*/ false, /*IsStrided*/ true, /*FF*/ false, ScalarSize,
+        static_cast<unsigned>(LMUL));
+    MachineSDNode *Load =
+        CurDAG->getMachineNode(P->Pseudo, DL, Node->getVTList(), Operands);
+    if (auto *MemOp = dyn_cast<MemSDNode>(Node))
+      CurDAG->setNodeMemRefs(Load, {MemOp->getMemOperand()});
+    ReplaceNode(Node, Load);
+    return;
+  }
   }
   // Select the default instruction.
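The case above relies on a strided load with rs2 = x0 rereading the same address for every element, i.e. it materializes a splat directly from memory. A minimal scalar model of that semantics (standalone C++ for illustration; the function name is made up and this is not the pseudo expansion):

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

// Scalar model of a strided vector load: element i is read from base + i*stride.
// With stride == 0 every element reads the same location, so the result is a
// splat of the scalar at 'base' -- which is why a stride of x0 can replace
// scalar load + vmv.v.x when the splat is not folded into a .vx operation.
std::vector<uint32_t> stridedLoadU32(const uint8_t *base, std::ptrdiff_t stride,
                                     std::size_t vl) {
  std::vector<uint32_t> result(vl);
  for (std::size_t i = 0; i < vl; ++i)
    std::memcpy(&result[i], base + i * stride, sizeof(uint32_t));
  return result;
}

int main() {
  uint32_t scalar = 42;
  for (uint32_t elt : stridedLoadU32(reinterpret_cast<const uint8_t *>(&scalar),
                                     /*stride=*/0, /*vl=*/4))
    printf("%u ", elt); // prints: 42 42 42 42
  printf("\n");
}

Run with stride 0, every lane holds the loaded scalar, matching what the vlse32.v/vlse64.v instructions with a zero stride produce in the test updates below.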

@@ -1632,17 +1632,40 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
         SDValue NewAddr = DAG.getMemBasePlusOffset(Ld->getBasePtr(),
                                                    TypeSize::Fixed(Offset), DL);
-        SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other});
-        SDValue IntID =
-            DAG.getTargetConstant(Intrinsic::riscv_vlse, DL, XLenVT);
-        SDValue Ops[] = {Ld->getChain(), IntID, NewAddr,
-                         DAG.getRegister(RISCV::X0, XLenVT), VL};
-        SDValue NewLoad = DAG.getMemIntrinsicNode(
-            ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, SVT,
-            DAG.getMachineFunction().getMachineMemOperand(
-                Ld->getMemOperand(), Offset, SVT.getStoreSize()));
-        DAG.makeEquivalentMemoryOrdering(Ld, NewLoad);
-        return convertFromScalableVector(VT, NewLoad, DAG, Subtarget);
+        // If this is SEW=64 on RV32, use a strided load with a stride of x0.
+        if (SVT.isInteger() && SVT.bitsGT(XLenVT)) {
+          SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other});
+          SDValue IntID =
+              DAG.getTargetConstant(Intrinsic::riscv_vlse, DL, XLenVT);
+          SDValue Ops[] = {Ld->getChain(), IntID, NewAddr,
+                           DAG.getRegister(RISCV::X0, XLenVT), VL};
+          SDValue NewLoad = DAG.getMemIntrinsicNode(
+              ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, SVT,
+              DAG.getMachineFunction().getMachineMemOperand(
+                  Ld->getMemOperand(), Offset, SVT.getStoreSize()));
+          DAG.makeEquivalentMemoryOrdering(Ld, NewLoad);
+          return convertFromScalableVector(VT, NewLoad, DAG, Subtarget);
+        }
+        // Otherwise use a scalar load and splat. This will give the best
+        // opportunity to fold a splat into the operation. ISel can turn it into
+        // the x0 strided load if we aren't able to fold away the select.
+        if (SVT.isFloatingPoint())
+          V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
+                          Ld->getPointerInfo().getWithOffset(Offset),
+                          Ld->getOriginalAlign(),
+                          Ld->getMemOperand()->getFlags());
+        else
+          V = DAG.getExtLoad(ISD::SEXTLOAD, DL, XLenVT, Ld->getChain(), NewAddr,
+                             Ld->getPointerInfo().getWithOffset(Offset), SVT,
+                             Ld->getOriginalAlign(),
+                             Ld->getMemOperand()->getFlags());
+        DAG.makeEquivalentMemoryOrdering(Ld, V);
+        unsigned Opc =
+            VT.isFloatingPoint() ? RISCVISD::VFMV_V_F_VL : RISCVISD::VMV_V_X_VL;
+        SDValue Splat = DAG.getNode(Opc, DL, ContainerVT, V, VL);
+        return convertFromScalableVector(VT, Splat, DAG, Subtarget);
       }
       V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
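A standalone sketch of the choice made in the new code path above (invented names, not the DAG lowering itself): integer elements wider than XLEN, such as i64 on RV32, cannot be carried in a scalar register, so the zero-stride vector load is emitted during lowering; everything else becomes a scalar load plus a splat node that instruction selection can still fold into a .vx/.vf use or turn back into the x0-strided load.

#include <cstdio>

// Hypothetical model of the decision; sewBits/xlenBits are passed explicitly.
enum class SplatLoadLowering {
  ZeroStrideVectorLoad, // e.g. vlse64.v vd, (rs1), zero: element too wide for a GPR
  ScalarLoadThenSplat   // scalar (sign-extending) load + splat; isel may fold it
};

SplatLoadLowering lowerSplatOfLoad(unsigned sewBits, unsigned xlenBits,
                                   bool isFloatingPoint) {
  // An integer element wider than XLEN (e.g. i64 on RV32) does not fit in a
  // scalar register, so go straight to the strided load during lowering.
  if (!isFloatingPoint && sewBits > xlenBits)
    return SplatLoadLowering::ZeroStrideVectorLoad;
  return SplatLoadLowering::ScalarLoadThenSplat;
}

int main() {
  printf("%d\n", static_cast<int>(lowerSplatOfLoad(64, 32, false))); // 0
  printf("%d\n", static_cast<int>(lowerSplatOfLoad(64, 64, false))); // 1
  printf("%d\n", static_cast<int>(lowerSplatOfLoad(64, 32, true)));  // 1 (FP load)
}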

@@ -23,16 +23,16 @@ define void @buildvec_no_vid_v4f32(<4 x float>* %x) {
 define void @buildvec_dominant0_v4f32(<4 x float>* %x) {
 ; CHECK-LABEL: buildvec_dominant0_v4f32:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
 ; CHECK-NEXT:    lui a1, %hi(.LCPI1_0)
-; CHECK-NEXT:    flw ft0, %lo(.LCPI1_0)(a1)
-; CHECK-NEXT:    fmv.w.x ft1, zero
-; CHECK-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
-; CHECK-NEXT:    vfmv.s.f v25, ft1
-; CHECK-NEXT:    vfmv.v.f v26, ft0
+; CHECK-NEXT:    addi a1, a1, %lo(.LCPI1_0)
+; CHECK-NEXT:    vlse32.v v25, (a1), zero
+; CHECK-NEXT:    fmv.w.x ft0, zero
+; CHECK-NEXT:    vfmv.s.f v26, ft0
 ; CHECK-NEXT:    vsetivli a1, 3, e32,m1,tu,mu
-; CHECK-NEXT:    vslideup.vi v26, v25, 2
+; CHECK-NEXT:    vslideup.vi v25, v26, 2
 ; CHECK-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
-; CHECK-NEXT:    vse32.v v26, (a0)
+; CHECK-NEXT:    vse32.v v25, (a0)
 ; CHECK-NEXT:    ret
   store <4 x float> <float 2.0, float 2.0, float 0.0, float 2.0>, <4 x float>* %x
   ret void

@@ -159,12 +159,12 @@ define <4 x double> @vrgather_shuffle_xv_v4f64(<4 x double> %x) {
 ; RV32-LABEL: vrgather_shuffle_xv_v4f64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi a0, zero, 12
-; RV32-NEXT:    lui a1, %hi(.LCPI7_0)
-; RV32-NEXT:    fld ft0, %lo(.LCPI7_0)(a1)
 ; RV32-NEXT:    vsetivli a1, 1, e8,m1,ta,mu
 ; RV32-NEXT:    vmv.s.x v0, a0
-; RV32-NEXT:    vsetivli a0, 4, e64,m2,ta,mu
-; RV32-NEXT:    vfmv.v.f v26, ft0
+; RV32-NEXT:    lui a0, %hi(.LCPI7_0)
+; RV32-NEXT:    addi a0, a0, %lo(.LCPI7_0)
+; RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
+; RV32-NEXT:    vlse64.v v26, (a0), zero
 ; RV32-NEXT:    lui a0, %hi(.LCPI7_1)
 ; RV32-NEXT:    addi a0, a0, %lo(.LCPI7_1)
 ; RV32-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
@@ -181,11 +181,11 @@ define <4 x double> @vrgather_shuffle_xv_v4f64(<4 x double> %x) {
 ; RV64-NEXT:    vmv.s.x v0, a0
 ; RV64-NEXT:    lui a0, %hi(.LCPI7_0)
 ; RV64-NEXT:    addi a0, a0, %lo(.LCPI7_0)
-; RV64-NEXT:    lui a1, %hi(.LCPI7_1)
-; RV64-NEXT:    fld ft0, %lo(.LCPI7_1)(a1)
 ; RV64-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
 ; RV64-NEXT:    vle64.v v28, (a0)
-; RV64-NEXT:    vfmv.v.f v26, ft0
+; RV64-NEXT:    lui a0, %hi(.LCPI7_1)
+; RV64-NEXT:    addi a0, a0, %lo(.LCPI7_1)
+; RV64-NEXT:    vlse64.v v26, (a0), zero
 ; RV64-NEXT:    vsetivli a0, 4, e64,m2,tu,mu
 ; RV64-NEXT:    vrgather.vv v26, v8, v28, v0.t
 ; RV64-NEXT:    vmv2r.v v8, v26
@@ -203,12 +203,12 @@ define <4 x double> @vrgather_shuffle_vx_v4f64(<4 x double> %x) {
 ; RV32-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
 ; RV32-NEXT:    vmv.s.x v25, a0
 ; RV32-NEXT:    vmv.v.i v28, 0
-; RV32-NEXT:    lui a0, %hi(.LCPI8_0)
-; RV32-NEXT:    fld ft0, %lo(.LCPI8_0)(a0)
 ; RV32-NEXT:    vsetivli a0, 2, e16,m1,tu,mu
 ; RV32-NEXT:    vslideup.vi v28, v25, 1
-; RV32-NEXT:    vsetivli a0, 4, e64,m2,ta,mu
-; RV32-NEXT:    vfmv.v.f v26, ft0
+; RV32-NEXT:    lui a0, %hi(.LCPI8_0)
+; RV32-NEXT:    addi a0, a0, %lo(.LCPI8_0)
+; RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
+; RV32-NEXT:    vlse64.v v26, (a0), zero
 ; RV32-NEXT:    vsetivli a0, 4, e64,m2,tu,mu
 ; RV32-NEXT:    vrgatherei16.vv v26, v8, v28, v0.t
 ; RV32-NEXT:    vmv2r.v v8, v26
@@ -222,12 +222,12 @@ define <4 x double> @vrgather_shuffle_vx_v4f64(<4 x double> %x) {
 ; RV64-NEXT:    vmv.v.i v28, 0
 ; RV64-NEXT:    vsetivli a1, 2, e64,m2,tu,mu
 ; RV64-NEXT:    vslideup.vi v28, v26, 1
-; RV64-NEXT:    lui a1, %hi(.LCPI8_0)
-; RV64-NEXT:    fld ft0, %lo(.LCPI8_0)(a1)
 ; RV64-NEXT:    vsetivli a1, 1, e8,m1,ta,mu
 ; RV64-NEXT:    vmv.s.x v0, a0
-; RV64-NEXT:    vsetivli a0, 4, e64,m2,ta,mu
-; RV64-NEXT:    vfmv.v.f v26, ft0
+; RV64-NEXT:    lui a0, %hi(.LCPI8_0)
+; RV64-NEXT:    addi a0, a0, %lo(.LCPI8_0)
+; RV64-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
+; RV64-NEXT:    vlse64.v v26, (a0), zero
 ; RV64-NEXT:    vsetivli a0, 4, e64,m2,tu,mu
 ; RV64-NEXT:    vrgather.vv v26, v8, v28, v0.t
 ; RV64-NEXT:    vmv2r.v v8, v26

@@ -49,11 +49,11 @@ define void @insertelt_v3i64(<3 x i64>* %x, i64 %y) {
 ; RV32-NEXT:    vmv.v.i v28, 0
 ; RV32-NEXT:    vsetivli a3, 2, e64,m2,tu,mu
 ; RV32-NEXT:    vslideup.vi v28, v26, 0
-; RV32-NEXT:    lw a3, 20(a0)
+; RV32-NEXT:    addi a3, a0, 20
 ; RV32-NEXT:    vsetivli a4, 4, e32,m1,ta,mu
-; RV32-NEXT:    lw a4, 16(a0)
-; RV32-NEXT:    vmv.v.x v26, a3
-; RV32-NEXT:    vmv.s.x v26, a4
+; RV32-NEXT:    vlse32.v v26, (a3), zero
+; RV32-NEXT:    lw a3, 16(a0)
+; RV32-NEXT:    vmv.s.x v26, a3
 ; RV32-NEXT:    vsetivli a3, 4, e64,m2,tu,mu
 ; RV32-NEXT:    vslideup.vi v28, v26, 2
 ; RV32-NEXT:    vsetivli a3, 2, e32,m2,ta,mu