From ed72bcae3467f6a3874ddad13c9212ebce818150 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 19 Feb 2020 13:23:11 -0500 Subject: [PATCH] AMDGPU/GlobalISel: Fix mishandling SGPR v2s16 add/sub/mul We weren't considering the packed case correctly, and this was passing through to the selector. The selector only checked the size, so this would incorrectly compile to a single 32-bit scalar add. As usual, the LegalizerHelper is somewhat awkward to use from applyMappingImpl. I think this is the first place we've needed multi-step legalization here though. --- .../AMDGPU/AMDGPUInstructionSelector.cpp | 6 +- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 24 ++- .../inst-select-scalar-packed.xfail.mir | 184 ++++++++++++++++++ .../GlobalISel/regbankselect-add.s16.mir | 95 +++++++++ ...lect-add.mir => regbankselect-add.s32.mir} | 0 .../GlobalISel/regbankselect-add.v2s16.mir | 82 ++++++++ 6 files changed, 386 insertions(+), 5 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-packed.xfail.mir create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.s16.mir rename llvm/test/CodeGen/AMDGPU/GlobalISel/{regbankselect-add.mir => regbankselect-add.s32.mir} (100%) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.v2s16.mir diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 287ad625db41..b79c44604dab 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -323,7 +323,11 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { MachineFunction *MF = BB->getParent(); Register DstReg = I.getOperand(0).getReg(); const DebugLoc &DL = I.getDebugLoc(); - unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); + LLT Ty = MRI->getType(DstReg); + if (Ty.isVector()) + return false; + + unsigned Size = Ty.getSizeInBits(); const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID; const bool Sub = I.getOpcode() == TargetOpcode::G_SUB; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index de21581052f4..042fceeae815 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -2080,7 +2080,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl( case AMDGPU::G_MUL: { Register DstReg = MI.getOperand(0).getReg(); LLT DstTy = MRI.getType(DstReg); - if (DstTy != LLT::scalar(16)) + const LLT S32 = LLT::scalar(32); + if (DstTy == S32) break; const RegisterBank *DstBank = @@ -2089,15 +2090,30 @@ void AMDGPURegisterBankInfo::applyMappingImpl( break; // 16-bit operations are VALU only, but can be promoted to 32-bit SALU. + // Packed 16-bit operations need to be scalarized and promoted. + MachineFunction *MF = MI.getParent()->getParent(); MachineIRBuilder B(MI); ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank); GISelObserverWrapper Observer(&ApplySALU); LegalizerHelper Helper(*MF, Observer, B); - if (Helper.widenScalar(MI, 0, LLT::scalar(32)) != - LegalizerHelper::Legalized) - llvm_unreachable("widen scalar should have succeeded"); + if (DstTy.isVector()) { + // FIXME: Multi-step legalization is awkward here. We're relying on the + // fact that widenScalar leaves the instruction in place in this case, and + // we have to do it in this order. + if (Helper.widenScalar(MI, 0, LLT::vector(2, 32)) != + LegalizerHelper::Legalized) + llvm_unreachable("widen scalar should have succeeded"); + + if (Helper.fewerElementsVector(MI, 0, S32) != LegalizerHelper::Legalized) + llvm_unreachable("fewerElementsVector should have succeeded"); + + } else { + if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) + llvm_unreachable("widen scalar should have succeeded"); + } + return; } case AMDGPU::G_SMIN: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-packed.xfail.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-packed.xfail.mir new file mode 100644 index 000000000000..9899944b5eb2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-packed.xfail.mir @@ -0,0 +1,184 @@ +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s + +# Make sure v2s16 SALU operations fail to select + +# ERR: remark: :0:0: cannot select: %2:sgpr(<2 x s16>) = G_ADD %0:sgpr, %1:sgpr (in function: s_add_v2s16) +# ERR: remark: :0:0: cannot select: %2:sgpr(<2 x s16>) = G_SUB %0:sgpr, %1:sgpr (in function: s_sub_v2s16) +# ERR: remark: :0:0: cannot select: %2:sgpr(<2 x s16>) = G_MUL %0:sgpr, %1:sgpr (in function: s_mul_v2s16) +# ERR: remark: :0:0: cannot select: %2:sgpr(<2 x s16>) = G_SHL %0:sgpr, %1:sgpr(<2 x s16>) (in function: s_shl_v2s16) +# ERR: remark: :0:0: cannot select: %2:sgpr(<2 x s16>) = G_LSHR %0:sgpr, %1:sgpr(<2 x s16>) (in function: s_lshr_v2s16) +# ERR: remark: :0:0: cannot select: %2:sgpr(<2 x s16>) = G_ASHR %0:sgpr, %1:sgpr(<2 x s16>) (in function: s_ashr_v2s16) +# ERR: remark: :0:0: cannot select: %2:sgpr(<2 x s16>) = G_SMIN %0:sgpr, %1:sgpr (in function: s_smin_v2s16) +# ERR: remark: :0:0: cannot select: %2:sgpr(<2 x s16>) = G_SMAX %0:sgpr, %1:sgpr (in function: s_smax_v2s16) +# ERR: remark: :0:0: cannot select: %2:sgpr(<2 x s16>) = G_UMIN %0:sgpr, %1:sgpr (in function: s_umin_v2s16) +# ERR: remark: :0:0: cannot select: %2:sgpr(<2 x s16>) = G_UMAX %0:sgpr, %1:sgpr (in function: s_umax_v2s16) + +--- +name: s_add_v2s16 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + + %0:sgpr(<2 x s16>) = COPY $sgpr0 + %1:sgpr(<2 x s16>) = COPY $sgpr1 + %2:sgpr(<2 x s16>) = G_ADD %0, %1 + S_ENDPGM 0, implicit %2 + +... + +--- +name: s_sub_v2s16 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + + %0:sgpr(<2 x s16>) = COPY $sgpr0 + %1:sgpr(<2 x s16>) = COPY $sgpr1 + %2:sgpr(<2 x s16>) = G_SUB %0, %1 + S_ENDPGM 0, implicit %2 + +... + +--- +name: s_mul_v2s16 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + + %0:sgpr(<2 x s16>) = COPY $sgpr0 + %1:sgpr(<2 x s16>) = COPY $sgpr1 + %2:sgpr(<2 x s16>) = G_MUL %0, %1 + S_ENDPGM 0, implicit %2 + +... + +--- +name: s_shl_v2s16 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + + %0:sgpr(<2 x s16>) = COPY $sgpr0 + %1:sgpr(<2 x s16>) = COPY $sgpr1 + %2:sgpr(<2 x s16>) = G_SHL %0, %1 + S_ENDPGM 0, implicit %2 + +... + +--- +name: s_lshr_v2s16 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + + %0:sgpr(<2 x s16>) = COPY $sgpr0 + %1:sgpr(<2 x s16>) = COPY $sgpr1 + %2:sgpr(<2 x s16>) = G_LSHR %0, %1 + S_ENDPGM 0, implicit %2 + +... + +--- +name: s_ashr_v2s16 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + + %0:sgpr(<2 x s16>) = COPY $sgpr0 + %1:sgpr(<2 x s16>) = COPY $sgpr1 + %2:sgpr(<2 x s16>) = G_ASHR %0, %1 + S_ENDPGM 0, implicit %2 + +... + +--- +name: s_smin_v2s16 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + + %0:sgpr(<2 x s16>) = COPY $sgpr0 + %1:sgpr(<2 x s16>) = COPY $sgpr1 + %2:sgpr(<2 x s16>) = G_SMIN %0, %1 + S_ENDPGM 0, implicit %2 + +... + +--- +name: s_smax_v2s16 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + + %0:sgpr(<2 x s16>) = COPY $sgpr0 + %1:sgpr(<2 x s16>) = COPY $sgpr1 + %2:sgpr(<2 x s16>) = G_SMAX %0, %1 + S_ENDPGM 0, implicit %2 + +... + +--- +name: s_umin_v2s16 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + + %0:sgpr(<2 x s16>) = COPY $sgpr0 + %1:sgpr(<2 x s16>) = COPY $sgpr1 + %2:sgpr(<2 x s16>) = G_UMIN %0, %1 + S_ENDPGM 0, implicit %2 + +... + +--- +name: s_umax_v2s16 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + + %0:sgpr(<2 x s16>) = COPY $sgpr0 + %1:sgpr(<2 x s16>) = COPY $sgpr1 + %2:sgpr(<2 x s16>) = G_UMAX %0, %1 + S_ENDPGM 0, implicit %2 + +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.s16.mir new file mode 100644 index 000000000000..189ee09638f2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.s16.mir @@ -0,0 +1,95 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s +--- +name: add_s16_ss +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + ; CHECK-LABEL: name: add_s16_ss + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; CHECK: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s16) + ; CHECK: [[ANYEXT1:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC1]](s16) + ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[ANYEXT]], [[ANYEXT1]] + ; CHECK: [[TRUNC2:%[0-9]+]]:sgpr(s16) = G_TRUNC [[ADD]](s32) + ; CHECK: S_ENDPGM 0, implicit [[TRUNC2]](s16) + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s16) = G_TRUNC %0 + %3:_(s16) = G_TRUNC %1 + %4:_(s16) = G_ADD %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: add_s16_sv +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $vgpr0 + ; CHECK-LABEL: name: add_s16_sv + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s16) = COPY [[TRUNC]](s16) + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s16) = G_ADD [[COPY2]], [[TRUNC1]] + ; CHECK: S_ENDPGM 0, implicit [[ADD]](s16) + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $vgpr0 + %2:_(s16) = G_TRUNC %0 + %3:_(s16) = G_TRUNC %1 + %4:_(s16) = G_ADD %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: add_s16_vs +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $vgpr0 + ; CHECK-LABEL: name: add_s16_vs + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s16) = COPY [[TRUNC1]](s16) + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s16) = G_ADD [[TRUNC]], [[COPY2]] + ; CHECK: S_ENDPGM 0, implicit [[ADD]](s16) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $sgpr0 + %2:_(s16) = G_TRUNC %0 + %3:_(s16) = G_TRUNC %1 + %4:_(s16) = G_ADD %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: add_s16_vv +legalized: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; CHECK-LABEL: name: add_s16_vv + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s16) = G_ADD [[TRUNC]], [[TRUNC1]] + ; CHECK: S_ENDPGM 0, implicit [[ADD]](s16) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s16) = G_TRUNC %0 + %3:_(s16) = G_TRUNC %1 + %4:_(s16) = G_ADD %2, %3 + S_ENDPGM 0, implicit %4 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.s32.mir similarity index 100% rename from llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.mir rename to llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.s32.mir diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.v2s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.v2s16.mir new file mode 100644 index 000000000000..e116106291ff --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.v2s16.mir @@ -0,0 +1,82 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s + +--- +name: add_v2s16_ss +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + ; CHECK-LABEL: name: add_v2s16_ss + ; CHECK: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr1 + ; CHECK: [[ANYEXT:%[0-9]+]]:sgpr(<2 x s32>) = G_ANYEXT [[COPY]](<2 x s16>) + ; CHECK: [[ANYEXT1:%[0-9]+]]:sgpr(<2 x s32>) = G_ANYEXT [[COPY1]](<2 x s16>) + ; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[ANYEXT]](<2 x s32>) + ; CHECK: [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[ANYEXT1]](<2 x s32>) + ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[UV]], [[UV2]] + ; CHECK: [[ADD1:%[0-9]+]]:sgpr(s32) = G_ADD [[UV1]], [[UV3]] + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<2 x s32>) = G_BUILD_VECTOR [[ADD]](s32), [[ADD1]](s32) + ; CHECK: [[TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_TRUNC [[BUILD_VECTOR]](<2 x s32>) + ; CHECK: S_ENDPGM 0, implicit [[TRUNC]](<2 x s16>) + %0:_(<2 x s16>) = COPY $sgpr0 + %1:_(<2 x s16>) = COPY $sgpr1 + %2:_(<2 x s16>) = G_ADD %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: add_v2s16_sv +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $vgpr0 + ; CHECK-LABEL: name: add_v2s16_sv + ; CHECK: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(<2 x s16>) = COPY [[COPY]](<2 x s16>) + ; CHECK: [[ADD:%[0-9]+]]:vgpr(<2 x s16>) = G_ADD [[COPY2]], [[COPY1]] + ; CHECK: S_ENDPGM 0, implicit [[ADD]](<2 x s16>) + %0:_(<2 x s16>) = COPY $sgpr0 + %1:_(<2 x s16>) = COPY $vgpr0 + %2:_(<2 x s16>) = G_ADD %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: add_v2s16_vs +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $vgpr0 + ; CHECK-LABEL: name: add_v2s16_vs + ; CHECK: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(<2 x s16>) = COPY [[COPY1]](<2 x s16>) + ; CHECK: [[ADD:%[0-9]+]]:vgpr(<2 x s16>) = G_ADD [[COPY]], [[COPY2]] + %0:_(<2 x s16>) = COPY $vgpr0 + %1:_(<2 x s16>) = COPY $sgpr0 + %2:_(<2 x s16>) = G_ADD %0, %1 +... + +--- +name: add_v2s16_vv +legalized: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; CHECK-LABEL: name: add_v2s16_vv + ; CHECK: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr1 + ; CHECK: [[ADD:%[0-9]+]]:vgpr(<2 x s16>) = G_ADD [[COPY]], [[COPY1]] + ; CHECK: S_ENDPGM 0, implicit [[ADD]](<2 x s16>) + %0:_(<2 x s16>) = COPY $vgpr0 + %1:_(<2 x s16>) = COPY $vgpr1 + %2:_(<2 x s16>) = G_ADD %0, %1 + S_ENDPGM 0, implicit %2 +...