[AArch64][GlobalISel] Implement partial support for G_SHUFFLE_VECTOR

This change makes some basic type combinations legal for G_SHUFFLE_VECTOR, and
selects them with a very pessimistic TBL2-based sequence in the instruction selector.

TBL2 also requires support for creating constant pool entries and loading from
them, in order to materialize the mask register.

Currently supports <2 x s64> and <4 x s32> result types.
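
As a worked illustration of the approach (not itself part of the patch): TBL
operates on bytes, so the selector expands each mask element into a run of
consecutive byte indices. A minimal standalone C++ sketch of that expansion,
with a hypothetical helper name:

// Mirrors the byte-index expansion loop in selectShuffleVector below;
// the helper name and use of std::vector are illustrative only.
#include <cstdint>
#include <vector>

static std::vector<uint8_t> expandMaskToTBLBytes(const std::vector<int> &Mask,
                                                 unsigned BytesPerElt) {
  std::vector<uint8_t> Bytes;
  for (int Val : Mask)
    for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte)
      Bytes.push_back(static_cast<uint8_t>(Byte + Val * BytesPerElt));
  return Bytes;
}

For a <4 x s32> shuffle with mask <5, 7, 1, 0>, BytesPerElt is 4 and the
expansion yields <20..23, 28..31, 4..7, 0..3>, exactly the constant pool entry
checked in the shuffle_tbl_v4i32 test below.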

Differential Revision: https://reviews.llvm.org/D58466

llvm-svn: 354521
Commit: a946d057b4 (parent 55cc7eb5cb)
Author: Amara Emerson
Date: 2019-02-20 22:11:39 +00:00
5 changed files, 359 additions, 1 deletion


@@ -23,6 +23,7 @@
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -75,6 +76,14 @@ private:
  bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const;
  bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const;
  void collectShuffleMaskIndices(MachineInstr &I, MachineRegisterInfo &MRI,
                                 SmallVectorImpl<int> &Idxs) const;
  bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI) const;
  unsigned emitConstantPoolEntry(Constant *CPVal, MachineFunction &MF) const;
  MachineInstr *emitLoadFromConstantPool(Constant *CPVal,
                                         MachineIRBuilder &MIRBuilder) const;
  ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
  ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root,
@@ -1696,6 +1705,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
    return selectMergeValues(I, MRI);
  case TargetOpcode::G_UNMERGE_VALUES:
    return selectUnmergeValues(I, MRI);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return selectShuffleVector(I, MRI);
  }

  return false;
@@ -1913,6 +1924,125 @@ bool AArch64InstructionSelector::selectUnmergeValues(
  return true;
}

void AArch64InstructionSelector::collectShuffleMaskIndices(
    MachineInstr &I, MachineRegisterInfo &MRI,
    SmallVectorImpl<int> &Idxs) const {
  MachineInstr *MaskDef = MRI.getVRegDef(I.getOperand(3).getReg());
  assert(
      MaskDef->getOpcode() == TargetOpcode::G_BUILD_VECTOR &&
      "G_SHUFFLE_VECTOR should have a constant mask operand as G_BUILD_VECTOR");
  // Find the constant indices.
  for (unsigned i = 1, e = MaskDef->getNumOperands(); i < e; ++i) {
    MachineInstr *ScalarDef = MRI.getVRegDef(MaskDef->getOperand(i).getReg());
    assert(ScalarDef && "Could not find vreg def of shufflevec index op");
    // Look through copies.
    while (ScalarDef->getOpcode() == TargetOpcode::COPY) {
      ScalarDef = MRI.getVRegDef(ScalarDef->getOperand(1).getReg());
      assert(ScalarDef && "Could not find def of copy operand");
    }
    assert(ScalarDef->getOpcode() == TargetOpcode::G_CONSTANT);
    Idxs.push_back(ScalarDef->getOperand(1).getCImm()->getSExtValue());
  }
}

unsigned
AArch64InstructionSelector::emitConstantPoolEntry(Constant *CPVal,
                                                  MachineFunction &MF) const {
  Type *CPTy = CPVal->getType()->getPointerTo();
  unsigned Align = MF.getDataLayout().getPrefTypeAlignment(CPTy);
  if (Align == 0)
    Align = MF.getDataLayout().getTypeAllocSize(CPTy);

  MachineConstantPool *MCP = MF.getConstantPool();
  return MCP->getConstantPoolIndex(CPVal, Align);
}

MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
    Constant *CPVal, MachineIRBuilder &MIRBuilder) const {
  unsigned CPIdx = emitConstantPoolEntry(CPVal, MIRBuilder.getMF());
  auto Adrp =
      MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {})
          .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE);
  auto Load =
      MIRBuilder
          .buildInstr(AArch64::LDRQui, {&AArch64::FPR128RegClass}, {Adrp})
          .addConstantPoolIndex(CPIdx, 0,
                                AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
  constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI);
  constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
  return &*Load;
}

bool AArch64InstructionSelector::selectShuffleVector(
    MachineInstr &I, MachineRegisterInfo &MRI) const {
  const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
  unsigned Src1Reg = I.getOperand(1).getReg();
  const LLT Src1Ty = MRI.getType(Src1Reg);
  unsigned Src2Reg = I.getOperand(2).getReg();
  const LLT Src2Ty = MRI.getType(Src2Reg);

  MachineBasicBlock &MBB = *I.getParent();
  MachineFunction &MF = *MBB.getParent();
  LLVMContext &Ctx = MF.getFunction().getContext();

  // G_SHUFFLE_VECTOR doesn't really have a strictly enforced constant mask
  // operand, it comes in as a normal vector value which we have to analyze to
  // find the mask indices.
  SmallVector<int, 8> Mask;
  collectShuffleMaskIndices(I, MRI, Mask);
  assert(!Mask.empty() && "Expected to find mask indices");

  // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if
  // it originated from a <1 x T> type. Those should have been lowered into
  // G_BUILD_VECTOR earlier.
  if (!Src1Ty.isVector() || !Src2Ty.isVector()) {
    LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n");
    return false;
  }

  unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8;

  SmallVector<Constant *, 64> CstIdxs;
  for (int Val : Mask) {
    for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
      unsigned Offset = Byte + Val * BytesPerElt;
      CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset));
    }
  }

  if (DstTy.getSizeInBits() != 128) {
    assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
    // This case can be done with TBL1.
    return false;
  }

  // Use a constant pool to load the index vector for TBL.
  Constant *CPVal = ConstantVector::get(CstIdxs);
  MachineIRBuilder MIRBuilder(I);
  MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIRBuilder);
  if (!IndexLoad) {
    LLVM_DEBUG(dbgs() << "Could not load from a constant pool\n");
    return false;
  }

  // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive
  // Q registers for regalloc.
  auto RegSeq = MIRBuilder
                    .buildInstr(TargetOpcode::REG_SEQUENCE,
                                {&AArch64::QQRegClass}, {Src1Reg})
                    .addImm(AArch64::qsub0)
                    .addUse(Src2Reg)
                    .addImm(AArch64::qsub1);

  auto TBL2 =
      MIRBuilder.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0).getReg()},
                            {RegSeq, IndexLoad->getOperand(0).getReg()});
  constrainSelectedInstRegOperands(*RegSeq, TII, TRI, RBI);
  constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI);
  I.eraseFromParent();
  return true;
}

bool AArch64InstructionSelector::selectBuildVector(
    MachineInstr &I, MachineRegisterInfo &MRI) const {
  assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
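
A hedged usage sketch for the new constant pool helper; the call site and the
SomeI8Constants array are hypothetical, not from the patch:

// Hypothetical caller of emitLoadFromConstantPool (illustration only).
// SomeI8Constants is assumed to be an ArrayRef<Constant *> of i8 values.
MachineIRBuilder MIB(I);
Constant *CPVal = ConstantVector::get(SomeI8Constants); // e.g. a <16 x i8> mask
if (MachineInstr *Load = emitLoadFromConstantPool(CPVal, MIB)) {
  unsigned MaskReg = Load->getOperand(0).getReg();
  // MaskReg is an FPR128 vreg holding the constant, materialized through the
  // ADRP + LDRQui pair built by the helper.
}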


@@ -461,6 +461,29 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
      {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
      .scalarize(1);

  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &DstTy = Query.Types[0];
        const LLT &SrcTy = Query.Types[1];
        // For now just support the TBL2 variant which needs the source vectors
        // to be the same size as the dest.
        if (DstTy != SrcTy)
          return false;
        ArrayRef<LLT> SupportedDstTys = {v2s32, v4s32, v2s64};
        for (auto &Ty : SupportedDstTys) {
          if (DstTy == Ty)
            return true;
        }
        return false;
      })
      // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors); we
      // just want those lowered into G_BUILD_VECTOR.
      .lowerIf([=](const LegalityQuery &Query) {
        return !Query.Types[1].isVector();
      })
      .clampNumElements(0, v4s32, v4s32)
      .clampNumElements(0, v2s64, v2s64);

  computeTables();
  verify(*ST.getInstrInfo());
}
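
To make the rule concrete, a short hedged sketch (not in the patch) of how a
few type combinations fare under it; the LLT constructors follow the GlobalISel
conventions of the period:

// Illustration only: which queries the legalIf above accepts.
const LLT v4s32 = LLT::vector(4, 32);
const LLT v2s64 = LLT::vector(2, 64);
const LLT v16s8 = LLT::vector(16, 8);
// Legal: dest and source types match and are in {v2s32, v4s32, v2s64}.
//   dst = v4s32, srcs = v4s32  -> legal, selected via TBL2
//   dst = v2s64, srcs = v2s64  -> legal, selected via TBL2
// Not matched by legalIf: mismatched dest/source types.
//   dst = v4s32, srcs = v16s8  -> falls through to the other actions
// Scalar sources (from <1 x T> vectors) hit the lowerIf and are rewritten
// in terms of G_BUILD_VECTOR instead.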


@@ -0,0 +1,54 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -march=aarch64 -run-pass=legalizer -global-isel-abort=1 %s -o - | FileCheck %s
---
name: shuffle_v4i32
alignment: 2
tracksRegLiveness: true
body: |
  bb.1:
    liveins: $q0, $q1

    ; CHECK-LABEL: name: shuffle_v4i32
    ; CHECK: liveins: $q0, $q1
    ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
    ; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
    ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
    ; CHECK: [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<4 x s32>), [[COPY1]], [[BUILD_VECTOR]](<4 x s32>)
    ; CHECK: $q0 = COPY [[SHUF]](<4 x s32>)
    ; CHECK: RET_ReallyLR implicit $q0
    %0:_(<4 x s32>) = COPY $q0
    %1:_(<4 x s32>) = COPY $q1
    %4:_(s32) = G_CONSTANT i32 0
    %3:_(<4 x s32>) = G_BUILD_VECTOR %4(s32), %4(s32), %4(s32), %4(s32)
    %2:_(<4 x s32>) = G_SHUFFLE_VECTOR %0(<4 x s32>), %1, %3(<4 x s32>)
    $q0 = COPY %2(<4 x s32>)
    RET_ReallyLR implicit $q0
...
---
name: shuffle_v2i64
alignment: 2
tracksRegLiveness: true
body: |
  bb.1:
    liveins: $q0, $q1

    ; CHECK-LABEL: name: shuffle_v2i64
    ; CHECK: liveins: $q0, $q1
    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
    ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
    ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32)
    ; CHECK: [[SHUF:%[0-9]+]]:_(<2 x s64>) = G_SHUFFLE_VECTOR [[COPY]](<2 x s64>), [[COPY1]], [[BUILD_VECTOR]](<2 x s32>)
    ; CHECK: $q0 = COPY [[SHUF]](<2 x s64>)
    ; CHECK: RET_ReallyLR implicit $q0
    %0:_(<2 x s64>) = COPY $q0
    %1:_(<2 x s64>) = COPY $q1
    %4:_(s32) = G_CONSTANT i32 0
    %3:_(<2 x s32>) = G_BUILD_VECTOR %4(s32), %4(s32)
    %2:_(<2 x s64>) = G_SHUFFLE_VECTOR %0(<2 x s64>), %1, %3(<2 x s32>)
    $q0 = COPY %2(<2 x s64>)
    RET_ReallyLR implicit $q0
...


@@ -313,7 +313,7 @@
# DEBUG: .. type index coverage check SKIPPED: user-defined predicate detected
#
 # DEBUG-NEXT: G_SHUFFLE_VECTOR (opcode {{[0-9]+}}): 3 type indices
-# DEBUG: .. type index coverage check SKIPPED: no rules defined
+# DEBUG: .. type index coverage check SKIPPED: user-defined predicate detected
#
# DEBUG-NEXT: G_CTTZ (opcode {{[0-9]+}}): 2 type indices
# DEBUG: .. type index coverage check SKIPPED: no rules defined


@@ -0,0 +1,151 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs %s -global-isel-abort=1 -o - | FileCheck %s
--- |
  ; ModuleID = 'shufflevec-only-legal.ll'
  source_filename = "shufflevec-only-legal.ll"
  target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
  target triple = "aarch64"

  define <4 x i32> @shuffle_v4i32(<4 x i32> %a, <4 x i32> %b) {
    %shuf = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 3, i32 0>
    ret <4 x i32> %shuf
  }

  define <4 x i32> @shuffle_tbl_v4i32(<4 x i32> %a, <4 x i32> %b) {
    %shuf = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
    ret <4 x i32> %shuf
  }

  define <2 x i64> @shuffle_v2i64(<2 x i64> %a, <2 x i64> %b) {
    %shuf = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> zeroinitializer
    ret <2 x i64> %shuf
  }

...
---
name: shuffle_v4i32
alignment: 2
legalized: true
regBankSelected: true
tracksRegLiveness: true
registers:
  - { id: 0, class: fpr }
  - { id: 1, class: fpr }
  - { id: 2, class: fpr }
  - { id: 3, class: fpr }
  - { id: 4, class: gpr }
  - { id: 5, class: gpr }
  - { id: 6, class: gpr }
body: |
  bb.1 (%ir-block.0):
    liveins: $q0, $q1

    ; CHECK-LABEL: name: shuffle_v4i32
    ; CHECK: constants:
    ; CHECK: value: '<16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3>'
    ; CHECK: alignment: 8
    ; CHECK: isTargetSpecific: false
    ; CHECK: liveins: $q0, $q1
    ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
    ; CHECK: [[COPY1:%[0-9]+]]:fpr128 = COPY $q1
    ; CHECK: [[ADRP:%[0-9]+]]:gpr64common = ADRP target-flags(aarch64-page) %const.0
    ; CHECK: [[LDRQui:%[0-9]+]]:fpr128 = LDRQui [[ADRP]], target-flags(aarch64-pageoff, aarch64-nc) %const.0
    ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:qq = REG_SEQUENCE [[COPY]], %subreg.qsub0, [[COPY1]], %subreg.qsub1
    ; CHECK: [[TBLv16i8Two:%[0-9]+]]:fpr128 = TBLv16i8Two [[REG_SEQUENCE]], [[LDRQui]]
    ; CHECK: $q0 = COPY [[TBLv16i8Two]]
    ; CHECK: RET_ReallyLR implicit $q0
    %0:fpr(<4 x s32>) = COPY $q0
    %1:fpr(<4 x s32>) = COPY $q1
    %4:gpr(s32) = G_CONSTANT i32 0
    %5:gpr(s32) = G_CONSTANT i32 1
    %6:gpr(s32) = G_CONSTANT i32 3
    %3:fpr(<4 x s32>) = G_BUILD_VECTOR %4(s32), %5(s32), %6(s32), %4(s32)
    %2:fpr(<4 x s32>) = G_SHUFFLE_VECTOR %0(<4 x s32>), %1, %3(<4 x s32>)
    $q0 = COPY %2(<4 x s32>)
    RET_ReallyLR implicit $q0
...
---
name: shuffle_tbl_v4i32
alignment: 2
legalized: true
regBankSelected: true
tracksRegLiveness: true
registers:
  - { id: 0, class: fpr }
  - { id: 1, class: fpr }
  - { id: 2, class: fpr }
  - { id: 3, class: fpr }
  - { id: 4, class: gpr }
  - { id: 5, class: gpr }
  - { id: 6, class: gpr }
  - { id: 7, class: gpr }
body: |
  bb.1 (%ir-block.0):
    liveins: $q0, $q1

    ; CHECK-LABEL: name: shuffle_tbl_v4i32
    ; CHECK: constants:
    ; CHECK: value: '<16 x i8> <i8 20, i8 21, i8 22, i8 23, i8 28, i8 29, i8 30, i8 31, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3>'
    ; CHECK: alignment: 8
    ; CHECK: isTargetSpecific: false
    ; CHECK: liveins: $q0, $q1
    ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
    ; CHECK: [[COPY1:%[0-9]+]]:fpr128 = COPY $q1
    ; CHECK: [[ADRP:%[0-9]+]]:gpr64common = ADRP target-flags(aarch64-page) %const.0
    ; CHECK: [[LDRQui:%[0-9]+]]:fpr128 = LDRQui [[ADRP]], target-flags(aarch64-pageoff, aarch64-nc) %const.0
    ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:qq = REG_SEQUENCE [[COPY]], %subreg.qsub0, [[COPY1]], %subreg.qsub1
    ; CHECK: [[TBLv16i8Two:%[0-9]+]]:fpr128 = TBLv16i8Two [[REG_SEQUENCE]], [[LDRQui]]
    ; CHECK: $q0 = COPY [[TBLv16i8Two]]
    ; CHECK: RET_ReallyLR implicit $q0
    %0:fpr(<4 x s32>) = COPY $q0
    %1:fpr(<4 x s32>) = COPY $q1
    %4:gpr(s32) = G_CONSTANT i32 5
    %5:gpr(s32) = G_CONSTANT i32 7
    %6:gpr(s32) = G_CONSTANT i32 1
    %7:gpr(s32) = G_CONSTANT i32 0
    %3:fpr(<4 x s32>) = G_BUILD_VECTOR %4(s32), %5(s32), %6(s32), %7(s32)
    %2:fpr(<4 x s32>) = G_SHUFFLE_VECTOR %0(<4 x s32>), %1, %3(<4 x s32>)
    $q0 = COPY %2(<4 x s32>)
    RET_ReallyLR implicit $q0
...
---
name: shuffle_v2i64
alignment: 2
legalized: true
regBankSelected: true
tracksRegLiveness: true
registers:
  - { id: 0, class: fpr }
  - { id: 1, class: fpr }
  - { id: 2, class: fpr }
  - { id: 3, class: fpr }
  - { id: 4, class: gpr }
body: |
  bb.1 (%ir-block.0):
    liveins: $q0, $q1

    ; CHECK-LABEL: name: shuffle_v2i64
    ; CHECK: constants:
    ; CHECK: value: '<16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>'
    ; CHECK: alignment: 8
    ; CHECK: isTargetSpecific: false
    ; CHECK: liveins: $q0, $q1
    ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
    ; CHECK: [[COPY1:%[0-9]+]]:fpr128 = COPY $q1
    ; CHECK: [[ADRP:%[0-9]+]]:gpr64common = ADRP target-flags(aarch64-page) %const.0
    ; CHECK: [[LDRQui:%[0-9]+]]:fpr128 = LDRQui [[ADRP]], target-flags(aarch64-pageoff, aarch64-nc) %const.0
    ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:qq = REG_SEQUENCE [[COPY]], %subreg.qsub0, [[COPY1]], %subreg.qsub1
    ; CHECK: [[TBLv16i8Two:%[0-9]+]]:fpr128 = TBLv16i8Two [[REG_SEQUENCE]], [[LDRQui]]
    ; CHECK: $q0 = COPY [[TBLv16i8Two]]
    ; CHECK: RET_ReallyLR implicit $q0
    %0:fpr(<2 x s64>) = COPY $q0
    %1:fpr(<2 x s64>) = COPY $q1
    %4:gpr(s32) = G_CONSTANT i32 0
    %3:fpr(<2 x s32>) = G_BUILD_VECTOR %4(s32), %4(s32)
    %2:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %0(<2 x s64>), %1, %3(<2 x s32>)
    $q0 = COPY %2(<2 x s64>)
    RET_ReallyLR implicit $q0
...