From a0933e6df759787ab7ce4622f693d9b8df774536 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 9 Sep 2019 18:57:51 +0000 Subject: [PATCH] AMDGPU/GlobalISel: Legalize G_BUILD_VECTOR v2s16 Handle it the same way as G_BUILD_VECTOR_TRUNC. Arguably only G_BUILD_VECTOR_TRUNC should be legal for this, but G_BUILD_VECTOR will probably be more convenient in most cases. llvm-svn: 371440 --- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 21 ++-- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 77 ++++++++++----- .../regbankselect-build-vector.v2s16.mir | 99 +++++++++++++++++++ 3 files changed, 163 insertions(+), 34 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-build-vector.v2s16.mir diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 7234b7f799e7..06492a1db850 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -713,14 +713,19 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, } - getActionDefinitionsBuilder(G_BUILD_VECTOR) - .legalForCartesianProduct(AllS32Vectors, {S32}) - .legalForCartesianProduct(AllS64Vectors, {S64}) - .clampNumElements(0, V16S32, V16S32) - .clampNumElements(0, V2S64, V8S64) - .minScalarSameAs(1, 0) - .legalIf(isRegisterType(0)) - .minScalarOrElt(0, S32); + auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) + .legalForCartesianProduct(AllS32Vectors, {S32}) + .legalForCartesianProduct(AllS64Vectors, {S64}) + .clampNumElements(0, V16S32, V16S32) + .clampNumElements(0, V2S64, V8S64); + + if (ST.hasScalarPackInsts()) + BuildVector.legalFor({V2S16, S32}); + + BuildVector + .minScalarSameAs(1, 0) + .legalIf(isRegisterType(0)) + .minScalarOrElt(0, S32); if (ST.hasScalarPackInsts()) { getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index aff746a413e7..c5ec496c4ede 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -1305,12 +1305,17 @@ void AMDGPURegisterBankInfo::applyMappingImpl( MI.eraseFromParent(); return; } + case AMDGPU::G_BUILD_VECTOR: case AMDGPU::G_BUILD_VECTOR_TRUNC: { + Register DstReg = MI.getOperand(0).getReg(); + LLT DstTy = MRI.getType(DstReg); + if (DstTy != LLT::vector(2, 16)) + break; + assert(MI.getNumOperands() == 3 && empty(OpdMapper.getVRegs(0))); substituteSimpleCopyRegs(OpdMapper, 1); substituteSimpleCopyRegs(OpdMapper, 2); - Register DstReg = MI.getOperand(0).getReg(); const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI); if (DstBank == &AMDGPU::SGPRRegBank) break; // Can use S_PACK_* instructions. @@ -1319,24 +1324,41 @@ void AMDGPURegisterBankInfo::applyMappingImpl( Register Lo = MI.getOperand(1).getReg(); Register Hi = MI.getOperand(2).getReg(); + const LLT S32 = LLT::scalar(32); const RegisterBank *BankLo = getRegBank(Lo, MRI, *TRI); const RegisterBank *BankHi = getRegBank(Hi, MRI, *TRI); - const LLT S32 = LLT::scalar(32); - auto MaskLo = B.buildConstant(S32, 0xffff); - MRI.setRegBank(MaskLo.getReg(0), *BankLo); + Register ZextLo; + Register ShiftHi; - auto ShiftAmt = B.buildConstant(S32, 16); - MRI.setRegBank(ShiftAmt.getReg(0), *BankHi); + if (Opc == AMDGPU::G_BUILD_VECTOR) { + ZextLo = B.buildZExt(S32, Lo).getReg(0); + MRI.setRegBank(ZextLo, *BankLo); - auto ShiftHi = B.buildShl(S32, Hi, ShiftAmt); - MRI.setRegBank(ShiftHi.getReg(0), *BankHi); + Register ZextHi = B.buildZExt(S32, Hi).getReg(0); + MRI.setRegBank(ZextHi, *BankHi); - auto Masked = B.buildAnd(S32, Lo, MaskLo); - MRI.setRegBank(Masked.getReg(0), *BankLo); + auto ShiftAmt = B.buildConstant(S32, 16); + MRI.setRegBank(ShiftAmt.getReg(0), *BankHi); - auto Or = B.buildOr(S32, Masked, ShiftHi); + ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0); + MRI.setRegBank(ShiftHi, *BankHi); + } else { + Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0); + MRI.setRegBank(MaskLo, *BankLo); + + auto ShiftAmt = B.buildConstant(S32, 16); + MRI.setRegBank(ShiftAmt.getReg(0), *BankHi); + + ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0); + MRI.setRegBank(ShiftHi, *BankHi); + + ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0); + MRI.setRegBank(ZextLo, *BankLo); + } + + auto Or = B.buildOr(S32, ZextLo, ShiftHi); MRI.setRegBank(Or.getReg(0), *DstBank); B.buildBitcast(DstReg, Or); @@ -1804,8 +1826,25 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = nullptr; break; } - case AMDGPU::G_MERGE_VALUES: case AMDGPU::G_BUILD_VECTOR: + case AMDGPU::G_BUILD_VECTOR_TRUNC: { + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + if (DstTy == LLT::vector(2, 16)) { + unsigned DstSize = DstTy.getSizeInBits(); + unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); + unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); + unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); + unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID); + + OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize); + OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize); + OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize); + break; + } + + LLVM_FALLTHROUGH; + } + case AMDGPU::G_MERGE_VALUES: case AMDGPU::G_CONCAT_VECTORS: { unsigned Bank = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; @@ -1818,20 +1857,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize); break; } - case AMDGPU::G_BUILD_VECTOR_TRUNC: { - assert(MI.getNumOperands() == 3); - - unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); - unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); - unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); - unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); - unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID); - - OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize); - OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize); - OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize); - break; - } case AMDGPU::G_BITCAST: case AMDGPU::G_INTTOPTR: case AMDGPU::G_PTRTOINT: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-build-vector.v2s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-build-vector.v2s16.mir new file mode 100644 index 000000000000..226e0fb549ea --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-build-vector.v2s16.mir @@ -0,0 +1,99 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s | FileCheck %s + +--- +name: build_vector_v2s16_s32_ss +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + ; CHECK-LABEL: name: build_vector_v2s16_s32_ss + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; CHECK: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s16) = G_TRUNC %0 + %3:_(s16) = G_TRUNC %1 + %4:_(<2 x s16>) = G_BUILD_VECTOR %2, %3 +... + +--- +name: build_vector_v2s16_s32_sv +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $vgpr0 + + ; CHECK-LABEL: name: build_vector_v2s16_s32_sv + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s16) + ; CHECK: [[ZEXT1:%[0-9]+]]:vgpr(s32) = G_ZEXT [[TRUNC1]](s16) + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 16 + ; CHECK: [[SHL:%[0-9]+]]:vgpr(s32) = G_SHL [[ZEXT1]], [[C]](s32) + ; CHECK: [[OR:%[0-9]+]]:vgpr(s32) = G_OR [[ZEXT]], [[SHL]] + ; CHECK: [[BITCAST:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR]](s32) + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $vgpr0 + %2:_(s16) = G_TRUNC %0 + %3:_(s16) = G_TRUNC %1 + %4:_(<2 x s16>) = G_BUILD_VECTOR %2, %3 +... + +--- +name: build_vector_v2s16_s32_vs +legalized: true + +body: | + bb.0: + liveins: $vgpr0, $sgpr0 + ; CHECK-LABEL: name: build_vector_v2s16_s32_vs + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[ZEXT:%[0-9]+]]:vgpr(s32) = G_ZEXT [[TRUNC]](s16) + ; CHECK: [[ZEXT1:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC1]](s16) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 + ; CHECK: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[ZEXT1]], [[C]](s32) + ; CHECK: [[OR:%[0-9]+]]:vgpr(s32) = G_OR [[ZEXT]], [[SHL]] + ; CHECK: [[BITCAST:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $sgpr0 + %2:_(s16) = G_TRUNC %0 + %3:_(s16) = G_TRUNC %1 + %4:_(<2 x s16>) = G_BUILD_VECTOR %2, %3 +... + +--- +name: build_vector_v2s16_s32_vv +legalized: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; CHECK-LABEL: name: build_vector_v2s16_s32_vv + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[ZEXT:%[0-9]+]]:vgpr(s32) = G_ZEXT [[TRUNC]](s16) + ; CHECK: [[ZEXT1:%[0-9]+]]:vgpr(s32) = G_ZEXT [[TRUNC1]](s16) + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 16 + ; CHECK: [[SHL:%[0-9]+]]:vgpr(s32) = G_SHL [[ZEXT1]], [[C]](s32) + ; CHECK: [[OR:%[0-9]+]]:vgpr(s32) = G_OR [[ZEXT]], [[SHL]] + ; CHECK: [[BITCAST:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s16) = G_TRUNC %0 + %3:_(s16) = G_TRUNC %1 + %4:_(<2 x s16>) = G_BUILD_VECTOR %2, %3 +...