forked from OSchip/llvm-project
[ARM] Add FP handling for MVE lane interleaving
FP16 to FP32 conversions can be handled in MVE lane interleaving, much like the sext/zext lowering we do. This expands the pass with fpext and fptrunc handling, plus basic fp operations, allowing more efficient lowering of fp vectors. Differential Revision: https://reviews.llvm.org/D97292
This commit is contained in:
parent
bcbea2ab84
commit
6c0a1ed3a9
|
@ -123,17 +123,20 @@ static bool isProfitableToInterleave(SmallSetVector<Instruction *, 4> &Exts,
|
||||||
// T=VLDRH.16; A=VMOVNB T; B=VMOVNT T
|
// T=VLDRH.16; A=VMOVNB T; B=VMOVNT T
|
||||||
// But those VMOVL may be folded into a VMULL.
|
// But those VMOVL may be folded into a VMULL.
|
||||||
|
|
||||||
// But expensive extends/truncs are always good to remove.
|
// But expensive extends/truncs are always good to remove. FPExts always
|
||||||
for (auto *E : Exts)
|
// involve extra VCVT's so are always considered to be beneficial to convert.
|
||||||
if (!isa<LoadInst>(E->getOperand(0))) {
|
for (auto *E : Exts) {
|
||||||
|
if (isa<FPExtInst>(E) || !isa<LoadInst>(E->getOperand(0))) {
|
||||||
LLVM_DEBUG(dbgs() << "Beneficial due to " << *E << "\n");
|
LLVM_DEBUG(dbgs() << "Beneficial due to " << *E << "\n");
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
for (auto *T : Truncs)
|
}
|
||||||
|
for (auto *T : Truncs) {
|
||||||
if (T->hasOneUse() && !isa<StoreInst>(*T->user_begin())) {
|
if (T->hasOneUse() && !isa<StoreInst>(*T->user_begin())) {
|
||||||
LLVM_DEBUG(dbgs() << "Beneficial due to " << *T << "\n");
|
LLVM_DEBUG(dbgs() << "Beneficial due to " << *T << "\n");
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Otherwise, we know we have a load(ext), see if any of the Extends are a
|
// Otherwise, we know we have a load(ext), see if any of the Extends are a
|
||||||
// vmull. This is a simple heuristic and certainly not perfect.
|
// vmull. This is a simple heuristic and certainly not perfect.
|
||||||
|
@ -172,6 +175,7 @@ static bool tryInterleave(Instruction *Start,
|
||||||
switch (I->getOpcode()) {
|
switch (I->getOpcode()) {
|
||||||
// Truncs
|
// Truncs
|
||||||
case Instruction::Trunc:
|
case Instruction::Trunc:
|
||||||
|
case Instruction::FPTrunc:
|
||||||
if (Truncs.count(I))
|
if (Truncs.count(I))
|
||||||
continue;
|
continue;
|
||||||
Truncs.insert(I);
|
Truncs.insert(I);
|
||||||
|
@ -181,6 +185,7 @@ static bool tryInterleave(Instruction *Start,
|
||||||
// Extend leafs
|
// Extend leafs
|
||||||
case Instruction::SExt:
|
case Instruction::SExt:
|
||||||
case Instruction::ZExt:
|
case Instruction::ZExt:
|
||||||
|
case Instruction::FPExt:
|
||||||
if (Exts.count(I))
|
if (Exts.count(I))
|
||||||
continue;
|
continue;
|
||||||
for (auto *Use : I->users())
|
for (auto *Use : I->users())
|
||||||
|
@ -196,6 +201,9 @@ static bool tryInterleave(Instruction *Start,
|
||||||
case Instruction::LShr:
|
case Instruction::LShr:
|
||||||
case Instruction::Shl:
|
case Instruction::Shl:
|
||||||
case Instruction::ICmp:
|
case Instruction::ICmp:
|
||||||
|
case Instruction::FCmp:
|
||||||
|
case Instruction::FAdd:
|
||||||
|
case Instruction::FMul:
|
||||||
case Instruction::Select:
|
case Instruction::Select:
|
||||||
if (Ops.count(I))
|
if (Ops.count(I))
|
||||||
continue;
|
continue;
|
||||||
|
@ -297,8 +305,10 @@ static bool tryInterleave(Instruction *Start,
|
||||||
LLVM_DEBUG(dbgs() << "Replacing ext " << *I << "\n");
|
LLVM_DEBUG(dbgs() << "Replacing ext " << *I << "\n");
|
||||||
Builder.SetInsertPoint(I);
|
Builder.SetInsertPoint(I);
|
||||||
Value *Shuffle = Builder.CreateShuffleVector(I->getOperand(0), LeafMask);
|
Value *Shuffle = Builder.CreateShuffleVector(I->getOperand(0), LeafMask);
|
||||||
|
bool FPext = isa<FPExtInst>(I);
|
||||||
bool Sext = isa<SExtInst>(I);
|
bool Sext = isa<SExtInst>(I);
|
||||||
Value *Ext = Sext ? Builder.CreateSExt(Shuffle, I->getType())
|
Value *Ext = FPext ? Builder.CreateFPExt(Shuffle, I->getType())
|
||||||
|
: Sext ? Builder.CreateSExt(Shuffle, I->getType())
|
||||||
: Builder.CreateZExt(Shuffle, I->getType());
|
: Builder.CreateZExt(Shuffle, I->getType());
|
||||||
I->replaceAllUsesWith(Ext);
|
I->replaceAllUsesWith(Ext);
|
||||||
LLVM_DEBUG(dbgs() << " with " << *Shuffle << "\n");
|
LLVM_DEBUG(dbgs() << " with " << *Shuffle << "\n");
|
||||||
|
|
|
@ -360,16 +360,14 @@ define void @both_8(half* nocapture readonly %x, half* noalias nocapture %y) {
|
||||||
; CHECK-NEXT: vldrw.u32 q0, [r2]
|
; CHECK-NEXT: vldrw.u32 q0, [r2]
|
||||||
; CHECK-NEXT: .LBB7_1: @ %vector.body
|
; CHECK-NEXT: .LBB7_1: @ %vector.body
|
||||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||||
; CHECK-NEXT: vldrh.u32 q1, [r0, #8]
|
; CHECK-NEXT: vldrh.u16 q1, [r0], #16
|
||||||
; CHECK-NEXT: vcvtb.f32.f16 q1, q1
|
; CHECK-NEXT: vcvtb.f32.f16 q2, q1
|
||||||
|
; CHECK-NEXT: vcvtt.f32.f16 q1, q1
|
||||||
|
; CHECK-NEXT: vmul.f32 q2, q2, q0
|
||||||
; CHECK-NEXT: vmul.f32 q1, q1, q0
|
; CHECK-NEXT: vmul.f32 q1, q1, q0
|
||||||
; CHECK-NEXT: vcvtb.f16.f32 q1, q1
|
; CHECK-NEXT: vcvtb.f16.f32 q2, q2
|
||||||
; CHECK-NEXT: vstrh.32 q1, [r1, #8]
|
; CHECK-NEXT: vcvtt.f16.f32 q2, q1
|
||||||
; CHECK-NEXT: vldrh.u32 q1, [r0], #16
|
; CHECK-NEXT: vstrb.8 q2, [r1], #16
|
||||||
; CHECK-NEXT: vcvtb.f32.f16 q1, q1
|
|
||||||
; CHECK-NEXT: vmul.f32 q1, q1, q0
|
|
||||||
; CHECK-NEXT: vcvtb.f16.f32 q1, q1
|
|
||||||
; CHECK-NEXT: vstrh.32 q1, [r1], #16
|
|
||||||
; CHECK-NEXT: le lr, .LBB7_1
|
; CHECK-NEXT: le lr, .LBB7_1
|
||||||
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
|
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
|
||||||
; CHECK-NEXT: pop {r7, pc}
|
; CHECK-NEXT: pop {r7, pc}
|
||||||
|
@ -412,26 +410,22 @@ define void @both_16(half* nocapture readonly %x, half* noalias nocapture %y) {
|
||||||
; CHECK-NEXT: vldrw.u32 q0, [r2]
|
; CHECK-NEXT: vldrw.u32 q0, [r2]
|
||||||
; CHECK-NEXT: .LBB8_1: @ %vector.body
|
; CHECK-NEXT: .LBB8_1: @ %vector.body
|
||||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||||
; CHECK-NEXT: vldrh.u32 q1, [r0, #24]
|
; CHECK-NEXT: vldrh.u16 q1, [r0, #16]
|
||||||
; CHECK-NEXT: vcvtb.f32.f16 q1, q1
|
; CHECK-NEXT: vcvtb.f32.f16 q2, q1
|
||||||
|
; CHECK-NEXT: vcvtt.f32.f16 q1, q1
|
||||||
|
; CHECK-NEXT: vmul.f32 q2, q2, q0
|
||||||
; CHECK-NEXT: vmul.f32 q1, q1, q0
|
; CHECK-NEXT: vmul.f32 q1, q1, q0
|
||||||
; CHECK-NEXT: vcvtb.f16.f32 q1, q1
|
; CHECK-NEXT: vcvtb.f16.f32 q2, q2
|
||||||
; CHECK-NEXT: vstrh.32 q1, [r1, #24]
|
; CHECK-NEXT: vcvtt.f16.f32 q2, q1
|
||||||
; CHECK-NEXT: vldrh.u32 q1, [r0, #16]
|
; CHECK-NEXT: vldrh.u16 q1, [r0], #32
|
||||||
; CHECK-NEXT: vcvtb.f32.f16 q1, q1
|
; CHECK-NEXT: vstrh.16 q2, [r1, #16]
|
||||||
|
; CHECK-NEXT: vcvtb.f32.f16 q2, q1
|
||||||
|
; CHECK-NEXT: vcvtt.f32.f16 q1, q1
|
||||||
|
; CHECK-NEXT: vmul.f32 q2, q2, q0
|
||||||
; CHECK-NEXT: vmul.f32 q1, q1, q0
|
; CHECK-NEXT: vmul.f32 q1, q1, q0
|
||||||
; CHECK-NEXT: vcvtb.f16.f32 q1, q1
|
; CHECK-NEXT: vcvtb.f16.f32 q2, q2
|
||||||
; CHECK-NEXT: vstrh.32 q1, [r1, #16]
|
; CHECK-NEXT: vcvtt.f16.f32 q2, q1
|
||||||
; CHECK-NEXT: vldrh.u32 q1, [r0, #8]
|
; CHECK-NEXT: vstrh.16 q2, [r1], #32
|
||||||
; CHECK-NEXT: vcvtb.f32.f16 q1, q1
|
|
||||||
; CHECK-NEXT: vmul.f32 q1, q1, q0
|
|
||||||
; CHECK-NEXT: vcvtb.f16.f32 q1, q1
|
|
||||||
; CHECK-NEXT: vstrh.32 q1, [r1, #8]
|
|
||||||
; CHECK-NEXT: vldrh.u32 q1, [r0], #32
|
|
||||||
; CHECK-NEXT: vcvtb.f32.f16 q1, q1
|
|
||||||
; CHECK-NEXT: vmul.f32 q1, q1, q0
|
|
||||||
; CHECK-NEXT: vcvtb.f16.f32 q1, q1
|
|
||||||
; CHECK-NEXT: vstrh.32 q1, [r1], #32
|
|
||||||
; CHECK-NEXT: le lr, .LBB8_1
|
; CHECK-NEXT: le lr, .LBB8_1
|
||||||
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
|
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
|
||||||
; CHECK-NEXT: pop {r7, pc}
|
; CHECK-NEXT: pop {r7, pc}
|
||||||
|
|
Loading…
Reference in New Issue