//=== lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp ------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;
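
// Match data for turning a select of a floating-point compare into one of the
// legacy min/max operations.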
struct FMinFMaxLegacyInfo {
  Register LHS;
  Register RHS;
  Register True;
  Register False;
  CmpInst::Predicate Pred;
};
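
// Match a G_SELECT whose condition is a G_FCMP, with one non-debug use, that
// compares the same two values the select chooses between. Such a select can
// be lowered to one of the legacy min/max operations.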
// TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
static bool matchFMinFMaxLegacy(MachineInstr &MI, MachineRegisterInfo &MRI,
                                MachineFunction &MF, FMinFMaxLegacyInfo &Info) {
  // FIXME: Combines should have subtarget predicates, and we shouldn't need
  // this here.
  if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
    return false;

  // FIXME: Type predicate on pattern
  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
    return false;

  Register Cond = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
      !(Info.LHS == Info.False && Info.RHS == Info.True))
    return false;

  // Predicates that only test for equality, inequality or orderedness have no
  // min/max equivalent.
  switch (Info.Pred) {
  case CmpInst::FCMP_FALSE:
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_ORD:
  case CmpInst::FCMP_UNO:
  case CmpInst::FCMP_UEQ:
  case CmpInst::FCMP_UNE:
  case CmpInst::FCMP_TRUE:
    return false;
  default:
    return true;
  }
}
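
// Lower the matched select to G_AMDGPU_FMIN_LEGACY/G_AMDGPU_FMAX_LEGACY.
// For example:
//   %c:_(s1) = G_FCMP floatpred(ult), %a, %b
//   %r:_(s32) = G_SELECT %c, %a, %b
// becomes
//   %r:_(s32) = G_AMDGPU_FMIN_LEGACY %b, %a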
static void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                              const FMinFMaxLegacyInfo &Info) {
  auto buildNewInst = [&MI](unsigned Opc, Register X, Register Y) {
    MachineIRBuilder MIB(MI);
    MIB.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // legacy min/max operations select their second operand when the compare
    // fails (i.e. when an input is NaN), so order the operands to match the
    // compare type the hardware uses.
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}
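
// Match a conversion to f32 or f16 whose integer source is known to have all
// bits above the low 8 clear, so it can be lowered to the hardware's
// byte-to-float conversion.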
static bool matchUCharToFloat(MachineInstr &MI, MachineRegisterInfo &MRI,
                              MachineFunction &MF, CombinerHelper &Helper) {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}
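
// Rewrite the conversion as G_AMDGPU_CVT_F32_UBYTE0, extending or truncating
// the source to 32 bits first if necessary, and narrowing an f16 result from
// f32 with G_FPTRUNC.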
static void applyUCharToFloat(MachineInstr &MI) {
  MachineIRBuilder B(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = B.getMRI()->getType(DstReg);
  LLT SrcTy = B.getMRI()->getType(SrcReg);
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg},
                 {SrcReg}, MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},
                             {SrcReg}, MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

// FIXME: Should be able to have 2 separate matchdatas rather than custom struct
// boilerplate.
struct CvtF32UByteMatchInfo {
  Register CvtVal;
  unsigned ShiftOffset;
};
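
// Fold a constant left/right shift of the source (looking through any G_ZEXT)
// into the byte index of a G_AMDGPU_CVT_F32_UBYTEn instruction. For example:
//   %shift:_(s32) = G_LSHR %x, 16
//   %cvt:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 %shift
// reads byte 2 of %x, so it can be rewritten as
//   %cvt:_(s32) = G_AMDGPU_CVT_F32_UBYTE2 %x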
static bool matchCvtF32UByteN(MachineInstr &MI, MachineRegisterInfo &MRI,
                              MachineFunction &MF,
                              CvtF32UByteMatchInfo &MatchInfo) {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    // A right shift means the conversion reads a higher byte of the unshifted
    // value; a left shift, a lower one.
    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    // The new byte offset must be byte aligned and in the range [8, 32).
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}
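
// Rewrite the conversion to the G_AMDGPU_CVT_F32_UBYTEn opcode that reads the
// selected byte directly from the unshifted source.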
static void applyCvtF32UByteN(MachineInstr &MI,
                              const CvtF32UByteMatchInfo &MatchInfo) {
  MachineIRBuilder B(MI);
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = B.getMRI()->getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;

public:
  AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

  AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                  const AMDGPULegalizerInfo *LI,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!GeneratedRuleCfg.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};
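
// Try the generated combine rules first; if none of them fire, fall back to
// the manually implemented combines.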
bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
  CombinerHelper Helper(Observer, B, KB, MDT, LInfo);
  AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg);

  if (Generated.tryCombineAll(Observer, MI, B, Helper))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
    // the same code size.
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}
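
// Entry point: build the combiner from the analyses requested above and run
// it over the function. Bail out if instruction selection already failed.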
bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI
      = static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), LI, KB, MDT);
  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm