[AArch64][GlobalISel] Selection support for vector DUP[X]lane instructions.

In future, we'd like to use the perfect-shuffle mechanism to deal with these
shuffle permutations. For now, this improves performance by avoiding the
super-expensive const-pool load + tbl instruction.

Differential Revision: https://reviews.llvm.org/D84866
This commit is contained in:
Amara Emerson 2020-07-29 00:21:15 -07:00
parent ebaa8b1c60
commit d8ba622209
6 changed files with 197 additions and 25 deletions

View File

@ -213,5 +213,9 @@ LLT getLCMType(LLT OrigTy, LLT TargetTy);
LLVM_READNONE
LLT getGCDType(LLT OrigTy, LLT TargetTy);
/// \returns The splat index of a G_SHUFFLE_VECTOR \p MI when \p MI is a splat,
/// i.e. when every defined (non-negative) mask element has the same value.
/// Undefined mask elements are ignored; if all elements are undefined, the
/// shuffle is treated as a splat and 0 is returned.
/// If \p MI is not a splat, returns None.
Optional<int> getSplatIndex(MachineInstr &MI);
} // End namespace llvm.
#endif

View File

@ -604,3 +604,24 @@ LLT llvm::getGCDType(LLT OrigTy, LLT TargetTy) {
unsigned GCD = greatestCommonDivisor(OrigSize, TargetSize);
return LLT::scalar(GCD);
}
/// Compute the splat index of the G_SHUFFLE_VECTOR \p MI.
///
/// Negative mask elements are undefined and are compatible with any splat
/// value. \returns the index shared by all defined mask elements, 0 when no
/// element is defined, or None when two defined elements disagree.
Optional<int> llvm::getSplatIndex(MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR &&
         "Only G_SHUFFLE_VECTOR can have a splat index!");

  // Defined mask elements are always >= 0, so -1 can stand for "no defined
  // element has been seen yet".
  int SplatIdx = -1;
  for (int MaskElt : MI.getOperand(3).getShuffleMask()) {
    // Skip undefined lanes; they never prevent a splat.
    if (MaskElt < 0)
      continue;
    if (SplatIdx < 0)
      SplatIdx = MaskElt; // First defined element fixes the splat value.
    else if (MaskElt != SplatIdx)
      return None; // A second, different defined element: not a splat.
  }

  // A fully undefined mask can be considered a splat. Report index 0 to give
  // callers better potential to simplify.
  if (SplatIdx < 0)
    return 0;
  return SplatIdx;
}

View File

@ -133,6 +133,8 @@ private:
bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const;
/// Try to select the G_SHUFFLE_VECTOR \p I as a DUPv*lane instruction when
/// its mask splats a single lane of the first source vector.
/// \returns true if \p I was replaced by the DUP and erased; false if the
/// pattern did not match, in which case \p I is left untouched.
bool tryOptShuffleDupLane(MachineInstr &I, LLT DstTy, LLT SrcTy,
ArrayRef<int> Mask, MachineRegisterInfo &MRI) const;
bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI) const;
@ -4306,6 +4308,67 @@ MachineInstr *AArch64InstructionSelector::tryOptArithShiftedCompare(
return &*CmpMI;
}
/// Try to select the G_SHUFFLE_VECTOR \p I as a single DUPv*lane instruction.
///
/// The fold applies when the shuffle mask is a splat of one lane of the first
/// source operand, the destination and source types are identical, and the
/// type is one of <2 x s64>, <4 x s32>, <8 x s16> or <16 x s8>.
///
/// \param I     The G_SHUFFLE_VECTOR to select.
/// \param DstTy Type of the shuffle result.
/// \param SrcTy Type of the first source vector.
/// \param Mask  Shuffle mask of \p I. Not read here; the splat index is
///              derived from \p I directly via getSplatIndex().
/// \param MRI   Not read here.
/// \returns true if \p I was replaced by a DUPv*lane and erased, false if the
/// pattern did not match (in which case \p I is left untouched).
bool AArch64InstructionSelector::tryOptShuffleDupLane(
    MachineInstr &I, LLT DstTy, LLT SrcTy, ArrayRef<int> Mask,
    MachineRegisterInfo &MRI) const {
  assert(I.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);

  // We assume that scalar->vector splats have been handled in the
  // post-legalizer combiner to G_DUP. However splats of a source vector's
  // lane don't fit that pattern, detect it here:
  // %res = G_SHUFFLE_VECTOR %src:<n x ty>, undef, <n x i32> splat(lane-idx)
  // =>
  // %res = DUPv[N][Ty]lane %src, lane-idx
  // FIXME: this case should be covered by re-implementing the perfect shuffle
  // codegen mechanism.
  auto LaneIdx = getSplatIndex(I);
  if (!LaneIdx)
    return false;

  // getSplatIndex() only yields 0 or a defined (non-negative) mask element,
  // which makes the unsigned comparison below well-defined.
  assert(*LaneIdx >= 0 && "Splat index must not be negative");

  // The lane idx should be within the first source vector; an index at or
  // beyond its element count would refer to the second source operand.
  if (static_cast<unsigned>(*LaneIdx) >= SrcTy.getNumElements())
    return false;

  if (DstTy != SrcTy)
    return false;

  const unsigned ScalarSize = SrcTy.getElementType().getSizeInBits();

  // Pick the DUP variant matching the vector shape. Only the element
  // count / element size pairs listed here are handled; anything else leaves
  // Opc at 0 and the fold is abandoned.
  unsigned Opc = 0;
  switch (SrcTy.getNumElements()) {
  case 2:
    if (ScalarSize == 64)
      Opc = AArch64::DUPv2i64lane;
    break;
  case 4:
    if (ScalarSize == 32)
      Opc = AArch64::DUPv4i32lane;
    break;
  case 8:
    if (ScalarSize == 16)
      Opc = AArch64::DUPv8i16lane;
    break;
  case 16:
    if (ScalarSize == 8)
      Opc = AArch64::DUPv16i8lane;
    break;
  default:
    break;
  }
  if (!Opc)
    return false;

  // Emit the DUP in place of the shuffle: same destination register, first
  // source as the input vector, splat index as the lane immediate.
  MachineIRBuilder MIB(I);
  auto Dup = MIB.buildInstr(Opc, {I.getOperand(0).getReg()},
                            {I.getOperand(1).getReg()})
                 .addImm(*LaneIdx);
  constrainSelectedInstRegOperands(*Dup, TII, TRI, RBI);
  I.eraseFromParent();
  return true;
}
bool AArch64InstructionSelector::selectShuffleVector(
MachineInstr &I, MachineRegisterInfo &MRI) const {
const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
@ -4327,6 +4390,9 @@ bool AArch64InstructionSelector::selectShuffleVector(
return false;
}
if (tryOptShuffleDupLane(I, DstTy, Src1Ty, Mask, MRI))
return true;
unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8;
SmallVector<Constant *, 64> CstIdxs;

View File

@ -20,6 +20,7 @@
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@ -43,29 +44,6 @@ struct ShuffleVectorPseudo {
ShuffleVectorPseudo() {}
};
/// Find the splat index of the G_SHUFFLE_VECTOR \p MI.
/// \returns the value shared by every defined (non-negative) mask element,
/// 0 if the mask has no defined element, or None if \p MI is not a splat.
static Optional<int> getSplatIndex(MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR &&
         "Only G_SHUFFLE_VECTOR can have a splat index!");
  ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();

  // Locate the first defined element; it determines the candidate splat value.
  auto FirstDef = find_if(ShuffleMask, [](int Elt) { return Elt >= 0; });

  // With nothing but undefined elements the shuffle can be considered a
  // splat. Report 0 so callers have better potential to simplify.
  if (FirstDef == ShuffleMask.end())
    return 0;

  // Every later element must be undef or repeat the candidate value.
  const int SplatIdx = *FirstDef;
  const bool RestMatches =
      all_of(make_range(std::next(FirstDef), ShuffleMask.end()),
             [SplatIdx](int Elt) { return Elt < 0 || Elt == SplatIdx; });
  if (!RestMatches)
    return None;
  return SplatIdx;
}
/// Check if a vector shuffle corresponds to a REV instruction with the
/// specified blocksize.
static bool isREVMask(ArrayRef<int> M, unsigned EltSize, unsigned NumElts,

View File

@ -0,0 +1,103 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=aarch64 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
#
# Checks that a G_SHUFFLE_VECTOR whose mask splats one lane of its first
# source is selected to the matching DUPv*lane instruction.
# NOTE(review): every mask below splats lane 0; non-zero lanes and masks
# containing undef elements are not exercised here.
...
---
# Splat of lane 0 of a <16 x s8> vector is expected to select DUPv16i8lane.
name: duplane_v16i8
alignment: 4
legalized: true
regBankSelected: true
tracksRegLiveness: true
liveins:
- { reg: '$q0' }
body: |
bb.1:
liveins: $q0
; CHECK-LABEL: name: duplane_v16i8
; CHECK: liveins: $q0
; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
; CHECK: [[DUPv16i8lane:%[0-9]+]]:fpr128 = DUPv16i8lane [[COPY]], 0
; CHECK: $q0 = COPY [[DUPv16i8lane]]
; CHECK: RET_ReallyLR implicit $q0
%0:fpr(<16 x s8>) = COPY $q0
%2:fpr(<16 x s8>) = G_IMPLICIT_DEF
%1:fpr(<16 x s8>) = G_SHUFFLE_VECTOR %0(<16 x s8>), %2, shufflemask(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
$q0 = COPY %1(<16 x s8>)
RET_ReallyLR implicit $q0
...
---
# Splat of lane 0 of an <8 x s16> vector is expected to select DUPv8i16lane.
name: duplane_v8i16
alignment: 4
legalized: true
regBankSelected: true
tracksRegLiveness: true
liveins:
- { reg: '$q0' }
body: |
bb.1:
liveins: $q0
; CHECK-LABEL: name: duplane_v8i16
; CHECK: liveins: $q0
; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
; CHECK: [[DUPv8i16lane:%[0-9]+]]:fpr128 = DUPv8i16lane [[COPY]], 0
; CHECK: $q0 = COPY [[DUPv8i16lane]]
; CHECK: RET_ReallyLR implicit $q0
%0:fpr(<8 x s16>) = COPY $q0
%2:fpr(<8 x s16>) = G_IMPLICIT_DEF
%1:fpr(<8 x s16>) = G_SHUFFLE_VECTOR %0(<8 x s16>), %2, shufflemask(0, 0, 0, 0, 0, 0, 0, 0)
$q0 = COPY %1(<8 x s16>)
RET_ReallyLR implicit $q0
...
---
# Splat of lane 0 of a <4 x s32> vector is expected to select DUPv4i32lane.
name: duplane_v4f32
alignment: 4
legalized: true
regBankSelected: true
tracksRegLiveness: true
liveins:
- { reg: '$q0' }
body: |
bb.1:
liveins: $q0
; CHECK-LABEL: name: duplane_v4f32
; CHECK: liveins: $q0
; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
; CHECK: [[DUPv4i32lane:%[0-9]+]]:fpr128 = DUPv4i32lane [[COPY]], 0
; CHECK: $q0 = COPY [[DUPv4i32lane]]
; CHECK: RET_ReallyLR implicit $q0
%0:fpr(<4 x s32>) = COPY $q0
%2:fpr(<4 x s32>) = G_IMPLICIT_DEF
%1:fpr(<4 x s32>) = G_SHUFFLE_VECTOR %0(<4 x s32>), %2, shufflemask(0, 0, 0, 0)
$q0 = COPY %1(<4 x s32>)
RET_ReallyLR implicit $q0
...
---
# Splat of lane 0 of a <2 x s64> vector is expected to select DUPv2i64lane.
name: duplane_v2i64
alignment: 4
legalized: true
regBankSelected: true
tracksRegLiveness: true
liveins:
- { reg: '$q0' }
body: |
bb.1:
liveins: $q0
; CHECK-LABEL: name: duplane_v2i64
; CHECK: liveins: $q0
; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
; CHECK: [[DUPv2i64lane:%[0-9]+]]:fpr128 = DUPv2i64lane [[COPY]], 0
; CHECK: $q0 = COPY [[DUPv2i64lane]]
; CHECK: RET_ReallyLR implicit $q0
%0:fpr(<2 x s64>) = COPY $q0
%2:fpr(<2 x s64>) = G_IMPLICIT_DEF
%1:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %0(<2 x s64>), %2, shufflemask(0, 0)
$q0 = COPY %1(<2 x s64>)
RET_ReallyLR implicit $q0
...

View File

@ -140,7 +140,7 @@ body: |
; CHECK-LABEL: name: shuffle_v2i64
; CHECK: constants:
; CHECK: value: '<16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>'
; CHECK: value: '<16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>'
; CHECK: alignment: 16
; CHECK: isTargetSpecific: false
; CHECK: liveins: $q0, $q1
@ -154,7 +154,7 @@ body: |
; CHECK: RET_ReallyLR implicit $q0
%0:fpr(<2 x s64>) = COPY $q0
%1:fpr(<2 x s64>) = COPY $q1
%2:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %0(<2 x s64>), %1, shufflemask(0, 0)
%2:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %0(<2 x s64>), %1, shufflemask(1, 0)
$q0 = COPY %2(<2 x s64>)
RET_ReallyLR implicit $q0