forked from OSchip/llvm-project
[AArch64][GlobalISel] Selection support for vector DUP[X]lane instructions.
In future, we'd like to use the perfect-shuffle mechanism to deal with these shuffle permutations. For now, this improves performance by avoiding the super-expensive const-pool load + tbl instruction. Differential Revision: https://reviews.llvm.org/D84866
This commit is contained in:
parent
ebaa8b1c60
commit
d8ba622209
|
@ -213,5 +213,9 @@ LLT getLCMType(LLT OrigTy, LLT TargetTy);
|
|||
LLVM_READNONE
|
||||
LLT getGCDType(LLT OrigTy, LLT TargetTy);
|
||||
|
||||
/// \returns The splat index of a G_SHUFFLE_VECTOR \p MI when \p MI is a splat.
|
||||
/// If \p MI is not a splat, returns None. A mask whose elements are all undef
/// is treated as a splat and yields index 0.
|
||||
Optional<int> getSplatIndex(MachineInstr &MI);
|
||||
|
||||
} // End namespace llvm.
|
||||
#endif
|
||||
|
|
|
@ -604,3 +604,24 @@ LLT llvm::getGCDType(LLT OrigTy, LLT TargetTy) {
|
|||
unsigned GCD = greatestCommonDivisor(OrigSize, TargetSize);
|
||||
return LLT::scalar(GCD);
|
||||
}
|
||||
|
||||
Optional<int> llvm::getSplatIndex(MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR &&
         "Only G_SHUFFLE_VECTOR can have a splat index!");

  // Single pass over the mask. Negative entries are undef lanes and never
  // disqualify a splat; every defined entry must agree with the first one.
  const ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
  int SplatValue = -1; // Stays negative until a defined lane has been seen.
  for (int Elt : ShuffleMask) {
    if (Elt < 0)
      continue;
    if (SplatValue < 0)
      SplatValue = Elt;
    else if (Elt != SplatValue)
      return None;
  }

  // A fully undef mask still counts as a splat; report lane 0 so that callers
  // have the best chance of simplifying.
  if (SplatValue < 0)
    return 0;

  return SplatValue;
}
|
||||
|
|
|
@ -133,6 +133,8 @@ private:
|
|||
bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const;
|
||||
bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const;
|
||||
|
||||
bool tryOptShuffleDupLane(MachineInstr &I, LLT DstTy, LLT SrcTy,
|
||||
ArrayRef<int> Mask, MachineRegisterInfo &MRI) const;
|
||||
bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI) const;
|
||||
bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI) const;
|
||||
bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI) const;
|
||||
|
@ -4306,6 +4308,67 @@ MachineInstr *AArch64InstructionSelector::tryOptArithShiftedCompare(
|
|||
return &*CmpMI;
|
||||
}
|
||||
|
||||
/// Try to select the G_SHUFFLE_VECTOR \p I as a single DUP-lane instruction.
///
/// Scalar->vector splats are assumed to have already been turned into G_DUP
/// by the post-legalizer combiner. A splat of one lane of a source vector
/// does not fit that pattern, so it is matched here:
///   %res = G_SHUFFLE_VECTOR %src:<n x ty>, undef, <n x i32> splat(lane-idx)
///     =>
///   %res = DUPv[N][Ty]lane %src, lane-idx
///
/// \param I     The G_SHUFFLE_VECTOR being selected; erased on success.
/// \param DstTy Type of the shuffle result.
/// \param SrcTy Type of the first shuffle source.
/// \param Mask  Not read by this function; the splat index is derived from
///              \p I through getSplatIndex.
/// \param MRI   Not read by this function.
/// \returns true if \p I was replaced by a DUP-lane instruction, false if the
/// pattern did not match (in which case \p I is left in place).
///
/// FIXME: this case should be covered by re-implementing the perfect shuffle
/// codegen mechanism.
bool AArch64InstructionSelector::tryOptShuffleDupLane(
    MachineInstr &I, LLT DstTy, LLT SrcTy, ArrayRef<int> Mask,
    MachineRegisterInfo &MRI) const {
  assert(I.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);

  // Only splat shuffles are candidates.
  const auto SplatLane = getSplatIndex(I);
  if (!SplatLane)
    return false;

  // The splatted lane has to come from the first source vector.
  const auto NumElts = SrcTy.getNumElements();
  if (*SplatLane >= NumElts)
    return false;

  // The result must have exactly the type of the source vector.
  if (DstTy != SrcTy)
    return false;

  // Map the (element count, element width) pair onto the matching DUP-lane
  // opcode. Zero means the vector shape is not handled here.
  const unsigned EltBits = SrcTy.getElementType().getSizeInBits();
  unsigned DupOpc = 0;
  if (NumElts == 2 && EltBits == 64)
    DupOpc = AArch64::DUPv2i64lane;
  else if (NumElts == 4 && EltBits == 32)
    DupOpc = AArch64::DUPv4i32lane;
  else if (NumElts == 8 && EltBits == 16)
    DupOpc = AArch64::DUPv8i16lane;
  else if (NumElts == 16 && EltBits == 8)
    DupOpc = AArch64::DUPv16i8lane;

  if (DupOpc == 0)
    return false;

  // Emit the DUP in front of the shuffle, constrain its operands, and only
  // then drop the generic instruction.
  const Register DstReg = I.getOperand(0).getReg();
  const Register SrcReg = I.getOperand(1).getReg();
  MachineIRBuilder Builder(I);
  auto DupMI = Builder.buildInstr(DupOpc, {DstReg}, {SrcReg});
  DupMI.addImm(*SplatLane);
  constrainSelectedInstRegOperands(*DupMI, TII, TRI, RBI);
  I.eraseFromParent();
  return true;
}
|
||||
|
||||
bool AArch64InstructionSelector::selectShuffleVector(
|
||||
MachineInstr &I, MachineRegisterInfo &MRI) const {
|
||||
const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
|
||||
|
@ -4327,6 +4390,9 @@ bool AArch64InstructionSelector::selectShuffleVector(
|
|||
return false;
|
||||
}
|
||||
|
||||
if (tryOptShuffleDupLane(I, DstTy, Src1Ty, Mask, MRI))
|
||||
return true;
|
||||
|
||||
unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8;
|
||||
|
||||
SmallVector<Constant *, 64> CstIdxs;
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
|
||||
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
|
||||
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
|
||||
#include "llvm/CodeGen/GlobalISel/Utils.h"
|
||||
#include "llvm/CodeGen/MachineDominators.h"
|
||||
#include "llvm/CodeGen/MachineFunctionPass.h"
|
||||
#include "llvm/CodeGen/TargetPassConfig.h"
|
||||
|
@ -43,29 +44,6 @@ struct ShuffleVectorPseudo {
|
|||
ShuffleVectorPseudo() {}
|
||||
};
|
||||
|
||||
/// \returns The splat index of a G_SHUFFLE_VECTOR \p MI when \p MI is a splat.
|
||||
/// If \p MI is not a splat, returns None.
|
||||
static Optional<int> getSplatIndex(MachineInstr &MI) {
|
||||
assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR &&
|
||||
"Only G_SHUFFLE_VECTOR can have a splat index!");
|
||||
ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
|
||||
auto FirstDefinedIdx = find_if(Mask, [](int Elt) { return Elt >= 0; });
|
||||
|
||||
// If all elements are undefined, this shuffle can be considered a splat.
|
||||
// Return 0 for better potential for callers to simplify.
|
||||
if (FirstDefinedIdx == Mask.end())
|
||||
return 0;
|
||||
|
||||
// Make sure all remaining elements are either undef or the same
|
||||
// as the first non-undef value.
|
||||
int SplatValue = *FirstDefinedIdx;
|
||||
if (any_of(make_range(std::next(FirstDefinedIdx), Mask.end()),
|
||||
[&SplatValue](int Elt) { return Elt >= 0 && Elt != SplatValue; }))
|
||||
return None;
|
||||
|
||||
return SplatValue;
|
||||
}
|
||||
|
||||
/// Check if a vector shuffle corresponds to a REV instruction with the
|
||||
/// specified blocksize.
|
||||
static bool isREVMask(ArrayRef<int> M, unsigned EltSize, unsigned NumElts,
|
||||
|
|
|
@ -0,0 +1,103 @@
|
|||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
||||
# RUN: llc -mtriple=aarch64 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
|
||||
...
|
||||
---
|
||||
name: duplane_v16i8
|
||||
alignment: 4
|
||||
legalized: true
|
||||
regBankSelected: true
|
||||
tracksRegLiveness: true
|
||||
liveins:
|
||||
- { reg: '$q0' }
|
||||
body: |
|
||||
bb.1:
|
||||
liveins: $q0
|
||||
|
||||
; CHECK-LABEL: name: duplane_v16i8
|
||||
; CHECK: liveins: $q0
|
||||
; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
|
||||
; CHECK: [[DUPv16i8lane:%[0-9]+]]:fpr128 = DUPv16i8lane [[COPY]], 0
|
||||
; CHECK: $q0 = COPY [[DUPv16i8lane]]
|
||||
; CHECK: RET_ReallyLR implicit $q0
|
||||
%0:fpr(<16 x s8>) = COPY $q0
|
||||
%2:fpr(<16 x s8>) = G_IMPLICIT_DEF
|
||||
%1:fpr(<16 x s8>) = G_SHUFFLE_VECTOR %0(<16 x s8>), %2, shufflemask(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
|
||||
$q0 = COPY %1(<16 x s8>)
|
||||
RET_ReallyLR implicit $q0
|
||||
|
||||
...
|
||||
---
|
||||
name: duplane_v8i16
|
||||
alignment: 4
|
||||
legalized: true
|
||||
regBankSelected: true
|
||||
tracksRegLiveness: true
|
||||
liveins:
|
||||
- { reg: '$q0' }
|
||||
body: |
|
||||
bb.1:
|
||||
liveins: $q0
|
||||
|
||||
; CHECK-LABEL: name: duplane_v8i16
|
||||
; CHECK: liveins: $q0
|
||||
; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
|
||||
; CHECK: [[DUPv8i16lane:%[0-9]+]]:fpr128 = DUPv8i16lane [[COPY]], 0
|
||||
; CHECK: $q0 = COPY [[DUPv8i16lane]]
|
||||
; CHECK: RET_ReallyLR implicit $q0
|
||||
%0:fpr(<8 x s16>) = COPY $q0
|
||||
%2:fpr(<8 x s16>) = G_IMPLICIT_DEF
|
||||
%1:fpr(<8 x s16>) = G_SHUFFLE_VECTOR %0(<8 x s16>), %2, shufflemask(0, 0, 0, 0, 0, 0, 0, 0)
|
||||
$q0 = COPY %1(<8 x s16>)
|
||||
RET_ReallyLR implicit $q0
|
||||
|
||||
...
|
||||
---
|
||||
name: duplane_v4f32
|
||||
alignment: 4
|
||||
legalized: true
|
||||
regBankSelected: true
|
||||
tracksRegLiveness: true
|
||||
liveins:
|
||||
- { reg: '$q0' }
|
||||
body: |
|
||||
bb.1:
|
||||
liveins: $q0
|
||||
|
||||
; CHECK-LABEL: name: duplane_v4f32
|
||||
; CHECK: liveins: $q0
|
||||
; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
|
||||
; CHECK: [[DUPv4i32lane:%[0-9]+]]:fpr128 = DUPv4i32lane [[COPY]], 0
|
||||
; CHECK: $q0 = COPY [[DUPv4i32lane]]
|
||||
; CHECK: RET_ReallyLR implicit $q0
|
||||
%0:fpr(<4 x s32>) = COPY $q0
|
||||
%2:fpr(<4 x s32>) = G_IMPLICIT_DEF
|
||||
%1:fpr(<4 x s32>) = G_SHUFFLE_VECTOR %0(<4 x s32>), %2, shufflemask(0, 0, 0, 0)
|
||||
$q0 = COPY %1(<4 x s32>)
|
||||
RET_ReallyLR implicit $q0
|
||||
|
||||
...
|
||||
---
|
||||
name: duplane_v2i64
|
||||
alignment: 4
|
||||
legalized: true
|
||||
regBankSelected: true
|
||||
tracksRegLiveness: true
|
||||
liveins:
|
||||
- { reg: '$q0' }
|
||||
body: |
|
||||
bb.1:
|
||||
liveins: $q0
|
||||
|
||||
; CHECK-LABEL: name: duplane_v2i64
|
||||
; CHECK: liveins: $q0
|
||||
; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
|
||||
; CHECK: [[DUPv2i64lane:%[0-9]+]]:fpr128 = DUPv2i64lane [[COPY]], 0
|
||||
; CHECK: $q0 = COPY [[DUPv2i64lane]]
|
||||
; CHECK: RET_ReallyLR implicit $q0
|
||||
%0:fpr(<2 x s64>) = COPY $q0
|
||||
%2:fpr(<2 x s64>) = G_IMPLICIT_DEF
|
||||
%1:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %0(<2 x s64>), %2, shufflemask(0, 0)
|
||||
$q0 = COPY %1(<2 x s64>)
|
||||
RET_ReallyLR implicit $q0
|
||||
|
||||
...
|
|
@ -140,7 +140,7 @@ body: |
|
|||
|
||||
; CHECK-LABEL: name: shuffle_v2i64
|
||||
; CHECK: constants:
|
||||
; CHECK: value: '<16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>'
|
||||
; CHECK: value: '<16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>'
|
||||
; CHECK: alignment: 16
|
||||
; CHECK: isTargetSpecific: false
|
||||
; CHECK: liveins: $q0, $q1
|
||||
|
@ -154,7 +154,7 @@ body: |
|
|||
; CHECK: RET_ReallyLR implicit $q0
|
||||
%0:fpr(<2 x s64>) = COPY $q0
|
||||
%1:fpr(<2 x s64>) = COPY $q1
|
||||
%2:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %0(<2 x s64>), %1, shufflemask(0, 0)
|
||||
%2:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %0(<2 x s64>), %1, shufflemask(1, 0)
|
||||
$q0 = COPY %2(<2 x s64>)
|
||||
RET_ReallyLR implicit $q0
|
||||
|
||||
|
|
Loading…
Reference in New Issue