forked from OSchip/llvm-project
[AArch64][GlobalISel] Add support for 64 bit vector shuffle using TBL1.
This extends the existing support for shufflevector to handle cases like <2 x float>, which we can implement by concating the vectors and using a TBL1. Differential Revision: https://reviews.llvm.org/D58684 llvm-svn: 355104
This commit is contained in:
parent
5d4d168c3c
commit
85c3afd7f6
|
@ -82,6 +82,8 @@ private:
|
|||
unsigned emitConstantPoolEntry(Constant *CPVal, MachineFunction &MF) const;
|
||||
MachineInstr *emitLoadFromConstantPool(Constant *CPVal,
|
||||
MachineIRBuilder &MIRBuilder) const;
|
||||
MachineInstr *emitVectorConcat(unsigned Op1, unsigned Op2,
|
||||
MachineIRBuilder &MIRBuilder) const;
|
||||
|
||||
ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
|
||||
|
||||
|
@ -1965,6 +1967,98 @@ MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
|
|||
return &*Load;
|
||||
}
|
||||
|
||||
/// Return an <Opcode, SubregIndex> pair to do an vector elt insert of a given
|
||||
/// size and RB.
|
||||
static std::pair<unsigned, unsigned>
|
||||
getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
|
||||
unsigned Opc, SubregIdx;
|
||||
if (RB.getID() == AArch64::GPRRegBankID) {
|
||||
if (EltSize == 32) {
|
||||
Opc = AArch64::INSvi32gpr;
|
||||
SubregIdx = AArch64::ssub;
|
||||
} else if (EltSize == 64) {
|
||||
Opc = AArch64::INSvi64gpr;
|
||||
SubregIdx = AArch64::dsub;
|
||||
} else {
|
||||
llvm_unreachable("invalid elt size!");
|
||||
}
|
||||
} else {
|
||||
if (EltSize == 8) {
|
||||
Opc = AArch64::INSvi8lane;
|
||||
SubregIdx = AArch64::bsub;
|
||||
} else if (EltSize == 16) {
|
||||
Opc = AArch64::INSvi16lane;
|
||||
SubregIdx = AArch64::hsub;
|
||||
} else if (EltSize == 32) {
|
||||
Opc = AArch64::INSvi32lane;
|
||||
SubregIdx = AArch64::ssub;
|
||||
} else if (EltSize == 64) {
|
||||
Opc = AArch64::INSvi64lane;
|
||||
SubregIdx = AArch64::dsub;
|
||||
} else {
|
||||
llvm_unreachable("invalid elt size!");
|
||||
}
|
||||
}
|
||||
return std::make_pair(Opc, SubregIdx);
|
||||
}
|
||||
|
||||
MachineInstr *AArch64InstructionSelector::emitVectorConcat(
|
||||
unsigned Op1, unsigned Op2, MachineIRBuilder &MIRBuilder) const {
|
||||
// We implement a vector concat by:
|
||||
// 1. Use scalar_to_vector to insert the lower vector into the larger dest
|
||||
// 2. Insert the upper vector into the destination's upper element
|
||||
// TODO: some of this code is common with G_BUILD_VECTOR handling.
|
||||
MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
|
||||
|
||||
const LLT Op1Ty = MRI.getType(Op1);
|
||||
const LLT Op2Ty = MRI.getType(Op2);
|
||||
|
||||
if (Op1Ty != Op2Ty) {
|
||||
LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys");
|
||||
return nullptr;
|
||||
}
|
||||
assert(Op1Ty.isVector() && "Expected a vector for vector concat");
|
||||
|
||||
if (Op1Ty.getSizeInBits() >= 128) {
|
||||
LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors");
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// At the moment we just support 64 bit vector concats.
|
||||
if (Op1Ty.getSizeInBits() != 64) {
|
||||
LLVM_DEBUG(dbgs() << "Vector concat supported for 64b vectors");
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits());
|
||||
const LLT &DstTy = LLT::vector(2, ScalarTy);
|
||||
const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI);
|
||||
const TargetRegisterClass *DstRC =
|
||||
getMinClassForRegBank(FPRBank, Op1Ty.getSizeInBits() * 2);
|
||||
|
||||
MachineInstr *WidenedOp1 = emitScalarToVector(DstTy, DstRC, Op1, MIRBuilder);
|
||||
MachineInstr *WidenedOp2 = emitScalarToVector(DstTy, DstRC, Op2, MIRBuilder);
|
||||
if (!WidenedOp1 || !WidenedOp2) {
|
||||
LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value");
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Now do the insert of the upper element.
|
||||
unsigned InsertOpc, InsSubRegIdx;
|
||||
std::tie(InsertOpc, InsSubRegIdx) =
|
||||
getInsertVecEltOpInfo(FPRBank, ScalarTy.getSizeInBits());
|
||||
|
||||
auto InsElt =
|
||||
MIRBuilder
|
||||
.buildInstr(InsertOpc, {DstRC}, {WidenedOp1->getOperand(0).getReg()})
|
||||
.addImm(1) /* Lane index */
|
||||
.addUse(WidenedOp2->getOperand(0).getReg())
|
||||
.addImm(0);
|
||||
|
||||
constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
|
||||
return &*InsElt;
|
||||
}
|
||||
|
||||
bool AArch64InstructionSelector::selectShuffleVector(
|
||||
MachineInstr &I, MachineRegisterInfo &MRI) const {
|
||||
const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
|
||||
|
@ -2002,21 +2096,37 @@ bool AArch64InstructionSelector::selectShuffleVector(
|
|||
}
|
||||
}
|
||||
|
||||
if (DstTy.getSizeInBits() != 128) {
|
||||
assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
|
||||
// This case can be done with TBL1.
|
||||
return false;
|
||||
}
|
||||
MachineIRBuilder MIRBuilder(I);
|
||||
|
||||
// Use a constant pool to load the index vector for TBL.
|
||||
Constant *CPVal = ConstantVector::get(CstIdxs);
|
||||
MachineIRBuilder MIRBuilder(I);
|
||||
MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIRBuilder);
|
||||
if (!IndexLoad) {
|
||||
LLVM_DEBUG(dbgs() << "Could not load from a constant pool");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (DstTy.getSizeInBits() != 128) {
|
||||
assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
|
||||
// This case can be done with TBL1.
|
||||
MachineInstr *Concat = emitVectorConcat(Src1Reg, Src2Reg, MIRBuilder);
|
||||
if (!Concat) {
|
||||
LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1");
|
||||
return false;
|
||||
}
|
||||
auto TBL1 = MIRBuilder.buildInstr(
|
||||
AArch64::TBLv16i8One, {&AArch64::FPR128RegClass},
|
||||
{Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()});
|
||||
constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI);
|
||||
|
||||
auto Copy = BuildMI(*I.getParent(), I, I.getDebugLoc(),
|
||||
TII.get(TargetOpcode::COPY), I.getOperand(0).getReg())
|
||||
.addUse(TBL1->getOperand(0).getReg(), 0, AArch64::dsub);
|
||||
RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI);
|
||||
I.eraseFromParent();
|
||||
return true;
|
||||
}
|
||||
|
||||
// For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive
|
||||
// Q registers for regalloc.
|
||||
auto RegSeq = MIRBuilder
|
||||
|
@ -2048,26 +2158,8 @@ bool AArch64InstructionSelector::selectBuildVector(
|
|||
const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
|
||||
unsigned Opc;
|
||||
unsigned SubregIdx;
|
||||
if (RB.getID() == AArch64::GPRRegBankID) {
|
||||
if (EltSize == 32) {
|
||||
Opc = AArch64::INSvi32gpr;
|
||||
SubregIdx = AArch64::ssub;
|
||||
} else {
|
||||
Opc = AArch64::INSvi64gpr;
|
||||
SubregIdx = AArch64::dsub;
|
||||
}
|
||||
} else {
|
||||
if (EltSize == 16) {
|
||||
Opc = AArch64::INSvi16lane;
|
||||
SubregIdx = AArch64::hsub;
|
||||
} else if (EltSize == 32) {
|
||||
Opc = AArch64::INSvi32lane;
|
||||
SubregIdx = AArch64::ssub;
|
||||
} else {
|
||||
Opc = AArch64::INSvi64lane;
|
||||
SubregIdx = AArch64::dsub;
|
||||
}
|
||||
}
|
||||
|
||||
std::tie(Opc, SubregIdx) = getInsertVecEltOpInfo(RB, EltSize);
|
||||
|
||||
MachineIRBuilder MIRBuilder(I);
|
||||
|
||||
|
|
|
@ -1,11 +1,17 @@
|
|||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
||||
# WARNING: update_mir_test_checks.py does not include the constant pools output,
|
||||
# so this test requires manual fixing up after running the script.
|
||||
|
||||
# RUN: llc -mtriple=aarch64-- -O0 -run-pass=instruction-select -verify-machineinstrs %s -global-isel-abort=1 -o - | FileCheck %s
|
||||
--- |
|
||||
; ModuleID = 'shufflevec-only-legal.ll'
|
||||
source_filename = "shufflevec-only-legal.ll"
|
||||
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
|
||||
target triple = "aarch64"
|
||||
|
||||
define <2 x float> @shuffle_v2f32(<2 x float> %a, <2 x float> %b) {
|
||||
%shuf = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 0>
|
||||
ret <2 x float> %shuf
|
||||
}
|
||||
|
||||
define <4 x i32> @shuffle_v4i32(<4 x i32> %a, <4 x i32> %b) {
|
||||
%shuf = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 3, i32 0>
|
||||
ret <4 x i32> %shuf
|
||||
|
@ -21,6 +27,45 @@
|
|||
ret <2 x i64> %shuf
|
||||
}
|
||||
|
||||
...
|
||||
---
|
||||
name: shuffle_v2f32
|
||||
alignment: 2
|
||||
legalized: true
|
||||
regBankSelected: true
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.1 (%ir-block.0):
|
||||
liveins: $d0, $d1
|
||||
|
||||
; CHECK-LABEL: name: shuffle_v2f32
|
||||
; CHECK: constants:
|
||||
; CHECK: - id: 0
|
||||
; CHECK: value: '<8 x i8> <i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3>'
|
||||
; CHECK: alignment: 8
|
||||
; CHECK: liveins: $d0, $d1
|
||||
; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
|
||||
; CHECK: [[COPY1:%[0-9]+]]:fpr64 = COPY $d1
|
||||
; CHECK: [[ADRP:%[0-9]+]]:gpr64common = ADRP target-flags(aarch64-page) %const.0
|
||||
; CHECK: [[LDRQui:%[0-9]+]]:fpr128 = LDRQui [[ADRP]], target-flags(aarch64-pageoff, aarch64-nc) %const.0
|
||||
; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
|
||||
; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.dsub
|
||||
; CHECK: [[DEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF
|
||||
; CHECK: [[INSERT_SUBREG1:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF1]], [[COPY1]], %subreg.dsub
|
||||
; CHECK: [[INSvi64lane:%[0-9]+]]:fpr128 = INSvi64lane [[INSERT_SUBREG]], 1, [[INSERT_SUBREG1]], 0
|
||||
; CHECK: [[TBLv16i8One:%[0-9]+]]:fpr128 = TBLv16i8One [[INSvi64lane]], [[LDRQui]]
|
||||
; CHECK: [[COPY2:%[0-9]+]]:fpr64 = COPY [[TBLv16i8One]].dsub
|
||||
; CHECK: $d0 = COPY [[COPY2]]
|
||||
; CHECK: RET_ReallyLR implicit $d0
|
||||
%0:fpr(<2 x s32>) = COPY $d0
|
||||
%1:fpr(<2 x s32>) = COPY $d1
|
||||
%4:gpr(s32) = G_CONSTANT i32 1
|
||||
%5:gpr(s32) = G_CONSTANT i32 0
|
||||
%3:fpr(<2 x s32>) = G_BUILD_VECTOR %4(s32), %5(s32)
|
||||
%2:fpr(<2 x s32>) = G_SHUFFLE_VECTOR %0(<2 x s32>), %1, %3(<2 x s32>)
|
||||
$d0 = COPY %2(<2 x s32>)
|
||||
RET_ReallyLR implicit $d0
|
||||
|
||||
...
|
||||
---
|
||||
name: shuffle_v4i32
|
||||
|
@ -28,14 +73,6 @@ alignment: 2
|
|||
legalized: true
|
||||
regBankSelected: true
|
||||
tracksRegLiveness: true
|
||||
registers:
|
||||
- { id: 0, class: fpr }
|
||||
- { id: 1, class: fpr }
|
||||
- { id: 2, class: fpr }
|
||||
- { id: 3, class: fpr }
|
||||
- { id: 4, class: gpr }
|
||||
- { id: 5, class: gpr }
|
||||
- { id: 6, class: gpr }
|
||||
body: |
|
||||
bb.1 (%ir-block.0):
|
||||
liveins: $q0, $q1
|
||||
|
@ -71,15 +108,6 @@ alignment: 2
|
|||
legalized: true
|
||||
regBankSelected: true
|
||||
tracksRegLiveness: true
|
||||
registers:
|
||||
- { id: 0, class: fpr }
|
||||
- { id: 1, class: fpr }
|
||||
- { id: 2, class: fpr }
|
||||
- { id: 3, class: fpr }
|
||||
- { id: 4, class: gpr }
|
||||
- { id: 5, class: gpr }
|
||||
- { id: 6, class: gpr }
|
||||
- { id: 7, class: gpr }
|
||||
body: |
|
||||
bb.1 (%ir-block.0):
|
||||
liveins: $q0, $q1
|
||||
|
@ -116,12 +144,6 @@ alignment: 2
|
|||
legalized: true
|
||||
regBankSelected: true
|
||||
tracksRegLiveness: true
|
||||
registers:
|
||||
- { id: 0, class: fpr }
|
||||
- { id: 1, class: fpr }
|
||||
- { id: 2, class: fpr }
|
||||
- { id: 3, class: fpr }
|
||||
- { id: 4, class: gpr }
|
||||
body: |
|
||||
bb.1 (%ir-block.0):
|
||||
liveins: $q0, $q1
|
||||
|
|
Loading…
Reference in New Issue