[ARM] Add FP handling for MVE lane interleaving

FP16 to FP32 converts can be handled in MVE lane interleaving, much like the sext/zext lowering we already do. This expands the pass with fpext and fptrunc handling, plus basic fp operations (fadd, fmul and fcmp), allowing more efficient lowering of fp vectors. Differential Revision: https://reviews.llvm.org/D97292
2021-04-12 15:28:13 +01:00 · 2021-04-12 15:28:13 +01:00 · 6c0a1ed3a9
parent bcbea2ab84
commit 6c0a1ed3a9
2 changed files with 37 additions and 33 deletions
--- a/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp
+++ b/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp
@ -123,17 +123,20 @@ static bool isProfitableToInterleave(SmallSetVector<Instruction *, 4> &Exts,
  //  T=VLDRH.16; A=VMOVNB T; B=VMOVNT T
  // But those VMOVL may be folded into a VMULL.

-  // But expensive extends/truncs are always good to remove.
-  for (auto *E : Exts)
-    if (!isa<LoadInst>(E->getOperand(0))) {
+  // But expensive extends/truncs are always good to remove. FPExts always
+  // involve extra VCVT's so are always considered to be beneficial to convert.
+  for (auto *E : Exts) {
+    if (isa<FPExtInst>(E) || !isa<LoadInst>(E->getOperand(0))) {
      LLVM_DEBUG(dbgs() << "Beneficial due to " << *E << "\n");
      return true;
    }
-  for (auto *T : Truncs)
+  }
+  for (auto *T : Truncs) {
    if (T->hasOneUse() && !isa<StoreInst>(*T->user_begin())) {
      LLVM_DEBUG(dbgs() << "Beneficial due to " << *T << "\n");
      return true;
    }
+  }

  // Otherwise, we know we have a load(ext), see if any of the Extends are a
  // vmull. This is a simple heuristic and certainly not perfect.
@ -172,6 +175,7 @@ static bool tryInterleave(Instruction *Start,
    switch (I->getOpcode()) {
    // Truncs
    case Instruction::Trunc:
+    case Instruction::FPTrunc:
      if (Truncs.count(I))
        continue;
      Truncs.insert(I);
@ -181,6 +185,7 @@ static bool tryInterleave(Instruction *Start,
    // Extend leafs
    case Instruction::SExt:
    case Instruction::ZExt:
+    case Instruction::FPExt:
      if (Exts.count(I))
        continue;
      for (auto *Use : I->users())
@ -196,6 +201,9 @@ static bool tryInterleave(Instruction *Start,
    case Instruction::LShr:
    case Instruction::Shl:
    case Instruction::ICmp:
+    case Instruction::FCmp:
+    case Instruction::FAdd:
+    case Instruction::FMul:
    case Instruction::Select:
      if (Ops.count(I))
        continue;
@ -297,9 +305,11 @@ static bool tryInterleave(Instruction *Start,
    LLVM_DEBUG(dbgs() << "Replacing ext " << *I << "\n");
    Builder.SetInsertPoint(I);
    Value *Shuffle = Builder.CreateShuffleVector(I->getOperand(0), LeafMask);
+    bool FPext = isa<FPExtInst>(I);
    bool Sext = isa<SExtInst>(I);
-    Value *Ext = Sext ? Builder.CreateSExt(Shuffle, I->getType())
-                      : Builder.CreateZExt(Shuffle, I->getType());
+    Value *Ext = FPext ? Builder.CreateFPExt(Shuffle, I->getType())
+                       : Sext ? Builder.CreateSExt(Shuffle, I->getType())
+                              : Builder.CreateZExt(Shuffle, I->getType());
    I->replaceAllUsesWith(Ext);
    LLVM_DEBUG(dbgs() << "  with " << *Shuffle << "\n");
  }
--- a/llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll
@ -360,16 +360,14 @@ define void @both_8(half* nocapture readonly %x, half* noalias nocapture %y) {
 ; CHECK-NEXT:    vldrw.u32 q0, [r2]
 ; CHECK-NEXT:  .LBB7_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrh.u32 q1, [r0, #8]
-; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
+; CHECK-NEXT:    vldrh.u16 q1, [r0], #16
+; CHECK-NEXT:    vcvtb.f32.f16 q2, q1
+; CHECK-NEXT:    vcvtt.f32.f16 q1, q1
+; CHECK-NEXT:    vmul.f32 q2, q2, q0
 ; CHECK-NEXT:    vmul.f32 q1, q1, q0
-; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
-; CHECK-NEXT:    vstrh.32 q1, [r1, #8]
-; CHECK-NEXT:    vldrh.u32 q1, [r0], #16
-; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
-; CHECK-NEXT:    vmul.f32 q1, q1, q0
-; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
-; CHECK-NEXT:    vstrh.32 q1, [r1], #16
+; CHECK-NEXT:    vcvtb.f16.f32 q2, q2
+; CHECK-NEXT:    vcvtt.f16.f32 q2, q1
+; CHECK-NEXT:    vstrb.8 q2, [r1], #16
 ; CHECK-NEXT:    le lr, .LBB7_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r7, pc}
@ -412,26 +410,22 @@ define void @both_16(half* nocapture readonly %x, half* noalias nocapture %y) {
 ; CHECK-NEXT:    vldrw.u32 q0, [r2]
 ; CHECK-NEXT:  .LBB8_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrh.u32 q1, [r0, #24]
-; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
+; CHECK-NEXT:    vldrh.u16 q1, [r0, #16]
+; CHECK-NEXT:    vcvtb.f32.f16 q2, q1
+; CHECK-NEXT:    vcvtt.f32.f16 q1, q1
+; CHECK-NEXT:    vmul.f32 q2, q2, q0
 ; CHECK-NEXT:    vmul.f32 q1, q1, q0
-; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
-; CHECK-NEXT:    vstrh.32 q1, [r1, #24]
-; CHECK-NEXT:    vldrh.u32 q1, [r0, #16]
-; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
+; CHECK-NEXT:    vcvtb.f16.f32 q2, q2
+; CHECK-NEXT:    vcvtt.f16.f32 q2, q1
+; CHECK-NEXT:    vldrh.u16 q1, [r0], #32
+; CHECK-NEXT:    vstrh.16 q2, [r1, #16]
+; CHECK-NEXT:    vcvtb.f32.f16 q2, q1
+; CHECK-NEXT:    vcvtt.f32.f16 q1, q1
+; CHECK-NEXT:    vmul.f32 q2, q2, q0
 ; CHECK-NEXT:    vmul.f32 q1, q1, q0
-; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
-; CHECK-NEXT:    vstrh.32 q1, [r1, #16]
-; CHECK-NEXT:    vldrh.u32 q1, [r0, #8]
-; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
-; CHECK-NEXT:    vmul.f32 q1, q1, q0
-; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
-; CHECK-NEXT:    vstrh.32 q1, [r1, #8]
-; CHECK-NEXT:    vldrh.u32 q1, [r0], #32
-; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
-; CHECK-NEXT:    vmul.f32 q1, q1, q0
-; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
-; CHECK-NEXT:    vstrh.32 q1, [r1], #32
+; CHECK-NEXT:    vcvtb.f16.f32 q2, q2
+; CHECK-NEXT:    vcvtt.f16.f32 q2, q1
+; CHECK-NEXT:    vstrh.16 q2, [r1], #32
 ; CHECK-NEXT:    le lr, .LBB8_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r7, pc}