llvm-project/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp

//===-- SIPreEmitPeephole.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass performs the peephole optimizations before code emission.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/Support/CommandLine.h"

using namespace llvm;

#define DEBUG_TYPE "si-pre-emit-peephole"

namespace {

/// Machine function pass that applies peephole optimizations just before code
/// emission. It currently hosts two rewrites: the VCC-branch rewrite
/// (optimizeVccBranch) and removal of a repeated S_SET_GPR_IDX_ON
/// (optimizeSetGPR).
class SIPreEmitPeephole : public MachineFunctionPass {
private:
  // Target instruction/register info. Both stay null until
  // runOnMachineFunction() fills them in from the function's subtarget.
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;

  // Attempts to rewrite the VCC-conditional branch MI; returns true if any
  // instruction was changed.
  bool optimizeVccBranch(MachineInstr &MI) const;
  // Attempts to erase MI when it repeats the earlier instruction First;
  // returns true if MI was erased.
  bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;

public:
  // Pass identification storage; handed to the MachineFunctionPass base.
  static char ID;

  // Registers the pass with the global PassRegistry on construction.
  SIPreEmitPeephole() : MachineFunctionPass(ID) {
    initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
  }

  // Pass entry point; returns true if the function was modified.
  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // End anonymous namespace.

// Register the pass under the DEBUG_TYPE name ("si-pre-emit-peephole") with a
// human-readable description. NOTE(review): the two trailing `false`
// arguments are presumably the macro's CFG-only and is-analysis flags —
// confirm against PassSupport.h.
INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
                "SI peephole optimizations", false, false)

// Definition of the static pass-identification member declared in the class.
char SIPreEmitPeephole::ID = 0;

// Reference to the pass ID exposed in the llvm namespace so code outside this
// file can name the pass.
char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;

/// Try to turn a VCC-conditional branch into an EXEC-conditional one.
///
/// Match:
///   sreg = -1
///   vcc = S_AND_B64 exec, sreg
///   S_CBRANCH_VCC[N]Z
/// =>
///   S_CBRANCH_EXEC[N]Z
///
/// We end up with this pattern sometimes after basic block placement.
/// It happens while combining a block which assigns -1 to a saved mask and
/// another block which consumes that saved mask and then a branch.
///
/// \param MI the S_CBRANCH_VCCZ / S_CBRANCH_VCCNZ terminator to examine. It
///        may be erased by this call.
/// \returns true if any instruction in the block was commuted, rewritten or
///          erased; false if the block was left untouched.
bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
  bool Changed = false;
  MachineBasicBlock &MBB = *MI.getParent();
  const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
  const bool IsWave32 = ST.isWave32();
  const unsigned CondReg = TRI->getVCC();
  const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;

  // Walk backwards from the branch looking for the S_AND that defines the
  // condition register. Give up after a handful of instructions, or as soon
  // as EXEC is modified or the condition is written by anything else.
  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                      E = MBB.rend();
  bool ReadsCond = false;
  unsigned Threshold = 5;
  for (++A; A != E; ++A) {
    if (!--Threshold)
      return false;
    if (A->modifiesRegister(ExecReg, TRI))
      return false;
    if (A->modifiesRegister(CondReg, TRI)) {
      if (!A->definesRegister(CondReg, TRI) || A->getOpcode() != And)
        return false;
      break;
    }
    // Remember whether something between the AND and the branch still needs
    // the condition value; if so the AND must be kept.
    ReadsCond |= A->readsRegister(CondReg, TRI);
  }
  if (A == E)
    return false;

  MachineOperand &Op1 = A->getOperand(1);
  MachineOperand &Op2 = A->getOperand(2);
  // Either source may be an immediate, so never call getReg() without first
  // checking isReg().
  auto IsExec = [ExecReg](const MachineOperand &MO) {
    return MO.isReg() && MO.getReg() == ExecReg;
  };

  // Canonicalize so that EXEC sits in operand 1. Only report a change when
  // the commute actually happened (commuteInstruction returns null on
  // failure).
  if (!IsExec(Op1) && IsExec(Op2)) {
    if (TII->commuteInstruction(*A))
      Changed = true;
  }
  if (!IsExec(Op1))
    return Changed;
  if (Op2.isImm() && Op2.getImm() != -1)
    return Changed;
  // Anything other than a register or an immediate mask cannot be proven to
  // be all ones; leave the branch alone.
  if (!Op2.isReg() && !Op2.isImm())
    return Changed;

  Register SReg;
  if (Op2.isReg()) {
    SReg = Op2.getReg();
    // Look further back for the definition of the mask register and require
    // it to be a move of the immediate -1.
    auto M = std::next(A);
    bool ReadsSreg = false;
    for (; M != E; ++M) {
      if (M->definesRegister(SReg, TRI))
        break;
      if (M->modifiesRegister(SReg, TRI))
        return Changed;
      ReadsSreg |= M->readsRegister(SReg, TRI);
    }
    if (M == E || !M->isMoveImmediate() || !M->getOperand(1).isImm() ||
        M->getOperand(1).getImm() != -1)
      return Changed;
    // First if sreg is only used in the AND instruction fold the immediate
    // into that AND and drop the now-dead move.
    if (!ReadsSreg && Op2.isKill()) {
      A->getOperand(2).ChangeToImmediate(-1);
      M->eraseFromParent();
    }
  }

  // The AND itself can go when nothing else reads the condition, its SCC
  // def is dead, and the branch is the last user of the condition register.
  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
      MI.killsRegister(CondReg, TRI))
    A->eraseFromParent();

  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    // The mask register is EXEC itself and was just set to -1, so the
    // condition is known: the "zero" branch is never taken and the
    // "non-zero" branch is unconditional.
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else {
    MI.setDesc(
        TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
  }

  // Swap the implicit operands of the old opcode for those of the new one.
  MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
  MI.addImplicitDefUseOperands(*MBB.getParent());

  return true;
}

/// Remove \p MI when it merely repeats the earlier instruction \p First.
///
/// The instructions strictly between \p First and \p MI are inspected; the
/// rewrite is abandoned if any of them is an S_SET_GPR_IDX_MODE, modifies M0,
/// modifies the index register of \p MI, or has a vector register operand
/// (except for a V_MOV_B32_e32 with an implicit M0 use seen while indexing is
/// still on). On success \p MI and every S_SET_GPR_IDX_OFF in between are
/// erased.
///
/// \returns true if \p MI was erased, false if nothing was changed.
bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
                                       MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction &MF = *MBB.getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  // Index source of the later instruction; a null Register when it is not a
  // register operand.
  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  Register IdxReg;
  if (Idx->isReg())
    IdxReg = Idx->getReg();

  if (!MI.isIdenticalTo(First))
    return false;

  auto IsVectorRegOperand = [&MRI, this](const MachineOperand &MO) {
    return MO.isReg() && TRI->isVectorRegister(MRI, MO.getReg());
  };

  // Pending S_SET_GPR_IDX_OFF instructions, erased only once the whole range
  // has been validated.
  SmallVector<MachineInstr *, 4> ToRemove;
  bool IdxOn = true;

  // Walk forward over everything between the two S_SET_GPR_IDX_ON.
  MachineBasicBlock::iterator Cur = std::next(First.getIterator());
  const MachineBasicBlock::iterator Stop = MI.getIterator();
  for (; Cur != Stop; ++Cur) {
    const unsigned Opc = Cur->getOpcode();

    if (Opc == AMDGPU::S_SET_GPR_IDX_MODE)
      return false;

    if (Opc == AMDGPU::S_SET_GPR_IDX_OFF) {
      IdxOn = false;
      ToRemove.push_back(&*Cur);
      continue;
    }

    if (Cur->modifiesRegister(AMDGPU::M0, TRI))
      return false;
    if (IdxReg && Cur->modifiesRegister(IdxReg, TRI))
      return false;

    if (!llvm::any_of(Cur->operands(), IsVectorRegOperand))
      continue;

    // The only exception allowed here is another indirect V_MOV_B32_e32
    // with the same mode.
    const bool IsIndirectMov = Opc == AMDGPU::V_MOV_B32_e32 &&
                               Cur->hasRegisterImplicitUseOperand(AMDGPU::M0);
    if (!IdxOn || !IsIndirectMov)
      return false;
  }

  MI.eraseFromParent();
  for (MachineInstr *Off : ToRemove)
    Off->eraseFromParent();
  return true;
}

/// Pass entry point: applies the peepholes to every basic block of \p MF.
///
/// For each block the first terminator is passed to optimizeVccBranch() when
/// it is an S_CBRANCH_VCCZ or S_CBRANCH_VCCNZ. Afterwards, on subtargets that
/// report hasVGPRIndexMode(), the instructions before the first terminator
/// are scanned for S_SET_GPR_IDX_ON pairs that optimizeSetGPR() can collapse.
///
/// \returns true if either peephole reported a change.
bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  bool Changed = false;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator MBBE = MBB.getFirstTerminator();
    if (MBBE != MBB.end()) {
      MachineInstr &MI = *MBBE;
      switch (MI.getOpcode()) {
      case AMDGPU::S_CBRANCH_VCCZ:
      case AMDGPU::S_CBRANCH_VCCNZ:
        Changed |= optimizeVccBranch(MI);
        // The branch may have been erased, which would leave MBBE dangling;
        // look the first terminator up again before it is used as the end of
        // the scan below. Falling through (rather than skipping to the next
        // block) lets blocks that end in a VCC branch benefit from the
        // S_SET_GPR_IDX_ON peephole as well.
        MBBE = MBB.getFirstTerminator();
        break;
      default:
        break;
      }
    }

    if (!ST.hasVGPRIndexMode())
      continue;

    MachineInstr *SetGPRMI = nullptr;
    const unsigned Threshold = 20;
    unsigned Count = 0;
    // Scan the block for two S_SET_GPR_IDX_ON instructions to see if a
    // second is not needed. Do expensive checks in the optimizeSetGPR()
    // and limit the distance to 20 instructions for compile time purposes.
    for (MachineBasicBlock::iterator MBBI = MBB.begin(); MBBI != MBBE; ) {
      MachineInstr &MI = *MBBI;
      // Advance first: optimizeSetGPR() may erase MI.
      ++MBBI;

      // Forget the candidate once it is too far behind the current position.
      if (Count == Threshold)
        SetGPRMI = nullptr;
      else
        ++Count;

      if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
        continue;

      Count = 0;
      if (!SetGPRMI) {
        SetGPRMI = &MI;
        continue;
      }

      // On success MI is gone and the earlier instruction stays the
      // candidate; otherwise MI becomes the new candidate.
      if (optimizeSetGPR(*SetGPRMI, MI))
        Changed = true;
      else
        SetGPRMI = &MI;
    }
  }

  return Changed;
}
[AMDGPU] Add SIPreEmitPeephole pass. This pass can handle all the optimization opportunities found just before code emission. Presently it includes the handling of vcc branch optimization that was handled earlier in SIInsertSkips. Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D76712 2020-03-25 22:15:16 +08:00			`//===-- SIPreEmitPeephole.cpp ------------------------------------===//`
			`//`
			`// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.`
			`// See https://llvm.org/LICENSE.txt for license information.`
			`// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception`
			`//`
			`//===----------------------------------------------------------------------===//`
			`//`
			`/// \file`
			`/// This pass performs the peephole optimizations before code emission.`
			`///`
			`//===----------------------------------------------------------------------===//`

			`#include "AMDGPU.h"`
			`#include "AMDGPUSubtarget.h"`
			`#include "MCTargetDesc/AMDGPUMCTargetDesc.h"`
			`#include "SIInstrInfo.h"`
			`#include "llvm/CodeGen/MachineFunctionPass.h"`
			`#include "llvm/Support/CommandLine.h"`

			`using namespace llvm;`

			`#define DEBUG_TYPE "si-pre-emit-peephole"`

			`namespace {`

			`class SIPreEmitPeephole : public MachineFunctionPass {`
			`private:`
			`const SIInstrInfo *TII = nullptr;`
			`const SIRegisterInfo *TRI = nullptr;`

			`bool optimizeVccBranch(MachineInstr &MI) const;`
[AMDGPU] Peephole adjacent equivalent S_SET_GPR_IDX_ON Differential Revision: https://reviews.llvm.org/D79907 2020-05-14 04:13:16 +08:00			`bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;`
[AMDGPU] Add SIPreEmitPeephole pass. This pass can handle all the optimization opportunities found just before code emission. Presently it includes the handling of vcc branch optimization that was handled earlier in SIInsertSkips. Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D76712 2020-03-25 22:15:16 +08:00
			`public:`
			`static char ID;`

			`SIPreEmitPeephole() : MachineFunctionPass(ID) {`
			`initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry());`
			`}`

			`bool runOnMachineFunction(MachineFunction &MF) override;`
			`};`

			`} // End anonymous namespace.`

			`INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,`
			`"SI peephole optimizations", false, false)`

			`char SIPreEmitPeephole::ID = 0;`

			`char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;`

			`bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {`
			`// Match:`
			`// sreg = -1`
			`// vcc = S_AND_B64 exec, sreg`
			`// S_CBRANCH_VCC[N]Z`
			`// =>`
			`// S_CBRANCH_EXEC[N]Z`
			`// We end up with this pattern sometimes after basic block placement.`
			`// It happens while combining a block which assigns -1 to a saved mask and`
			`// another block which consumes that saved mask and then a branch.`
			`bool Changed = false;`
			`MachineBasicBlock &MBB = *MI.getParent();`
			`const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();`
			`const bool IsWave32 = ST.isWave32();`
			`const unsigned CondReg = TRI->getVCC();`
			`const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;`
			`const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;`

			`MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),`
			`E = MBB.rend();`
			`bool ReadsCond = false;`
			`unsigned Threshold = 5;`
			`for (++A; A != E; ++A) {`
			`if (!--Threshold)`
			`return false;`
			`if (A->modifiesRegister(ExecReg, TRI))`
			`return false;`
			`if (A->modifiesRegister(CondReg, TRI)) {`
			`if (!A->definesRegister(CondReg, TRI) \|\| A->getOpcode() != And)`
			`return false;`
			`break;`
			`}`
			`ReadsCond \|= A->readsRegister(CondReg, TRI);`
			`}`
			`if (A == E)`
			`return false;`

			`MachineOperand &Op1 = A->getOperand(1);`
			`MachineOperand &Op2 = A->getOperand(2);`
			`if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {`
			`TII->commuteInstruction(*A);`
			`Changed = true;`
			`}`
			`if (Op1.getReg() != ExecReg)`
			`return Changed;`
			`if (Op2.isImm() && Op2.getImm() != -1)`
			`return Changed;`

			`Register SReg;`
			`if (Op2.isReg()) {`
			`SReg = Op2.getReg();`
			`auto M = std::next(A);`
			`bool ReadsSreg = false;`
			`for (; M != E; ++M) {`
			`if (M->definesRegister(SReg, TRI))`
			`break;`
			`if (M->modifiesRegister(SReg, TRI))`
			`return Changed;`
			`ReadsSreg \|= M->readsRegister(SReg, TRI);`
			`}`
			`if (M == E \|\| !M->isMoveImmediate() \|\| !M->getOperand(1).isImm() \|\|`
			`M->getOperand(1).getImm() != -1)`
			`return Changed;`
			`// First if sreg is only used in and instruction fold the immediate`
			`// into that and.`
			`if (!ReadsSreg && Op2.isKill()) {`
			`A->getOperand(2).ChangeToImmediate(-1);`
			`M->eraseFromParent();`
			`}`
			`}`

			`if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&`
			`MI.killsRegister(CondReg, TRI))`
			`A->eraseFromParent();`

			`bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;`
			`if (SReg == ExecReg) {`
			`if (IsVCCZ) {`
			`MI.eraseFromParent();`
			`return true;`
			`}`
			`MI.setDesc(TII->get(AMDGPU::S_BRANCH));`
			`} else {`
			`MI.setDesc(`
			`TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));`
			`}`

			`MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /Kill/, TRI));`
			`MI.addImplicitDefUseOperands(*MBB.getParent());`

			`return true;`
			`}`

[AMDGPU] Peephole adjacent equivalent S_SET_GPR_IDX_ON Differential Revision: https://reviews.llvm.org/D79907 2020-05-14 04:13:16 +08:00			`bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,`
			`MachineInstr &MI) const {`
			`MachineBasicBlock &MBB = *MI.getParent();`
			`const MachineFunction &MF = *MBB.getParent();`
			`const MachineRegisterInfo &MRI = MF.getRegInfo();`
			`MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);`
			`Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();`
			`SmallVector<MachineInstr *, 4> ToRemove;`
			`bool IdxOn = true;`

			`if (!MI.isIdenticalTo(First))`
			`return false;`

			`// Scan back to find an identical S_SET_GPR_IDX_ON`
			`for (MachineBasicBlock::iterator I = std::next(First.getIterator()),`
			`E = MI.getIterator(); I != E; ++I) {`
			`switch (I->getOpcode()) {`
			`case AMDGPU::S_SET_GPR_IDX_MODE:`
			`return false;`
			`case AMDGPU::S_SET_GPR_IDX_OFF:`
			`IdxOn = false;`
			`ToRemove.push_back(&*I);`
			`break;`
			`default:`
			`if (I->modifiesRegister(AMDGPU::M0, TRI))`
			`return false;`
			`if (IdxReg && I->modifiesRegister(IdxReg, TRI))`
			`return false;`
			`if (llvm::any_of(I->operands(),`
			`[&MRI, this](const MachineOperand &MO) {`
			`return MO.isReg() &&`
			`TRI->isVectorRegister(MRI, MO.getReg());`
			`})) {`
			`// The only exception allowed here is another indirect V_MOV_B32_e32`
			`// with the same mode.`
			`if (!IdxOn \|\| I->getOpcode() != AMDGPU::V_MOV_B32_e32 \|\|`
			`!I->hasRegisterImplicitUseOperand(AMDGPU::M0))`
			`return false;`
			`}`
			`}`
			`}`

			`MI.eraseFromParent();`
			`for (MachineInstr *RI : ToRemove)`
			`RI->eraseFromParent();`
			`return true;`
			`}`

[AMDGPU] Add SIPreEmitPeephole pass. This pass can handle all the optimization opportunities found just before code emission. Presently it includes the handling of vcc branch optimization that was handled earlier in SIInsertSkips. Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D76712 2020-03-25 22:15:16 +08:00			`bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {`
			`const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();`
			`TII = ST.getInstrInfo();`
			`TRI = &TII->getRegisterInfo();`
			`bool Changed = false;`

			`for (MachineBasicBlock &MBB : MF) {`
[AMDGPU] Peephole adjacent equivalent S_SET_GPR_IDX_ON Differential Revision: https://reviews.llvm.org/D79907 2020-05-14 04:13:16 +08:00			`MachineBasicBlock::iterator MBBE = MBB.getFirstTerminator();`
			`if (MBBE != MBB.end()) {`
			`MachineInstr &MI = *MBBE;`
			`switch (MI.getOpcode()) {`
			`case AMDGPU::S_CBRANCH_VCCZ:`
			`case AMDGPU::S_CBRANCH_VCCNZ:`
			`Changed \|= optimizeVccBranch(MI);`
			`continue;`
			`default:`
			`break;`
			`}`
			`}`

			`if (!ST.hasVGPRIndexMode())`
[AMDGPU] Add SIPreEmitPeephole pass. This pass can handle all the optimization opportunities found just before code emission. Presently it includes the handling of vcc branch optimization that was handled earlier in SIInsertSkips. Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D76712 2020-03-25 22:15:16 +08:00			`continue;`

[AMDGPU] Peephole adjacent equivalent S_SET_GPR_IDX_ON Differential Revision: https://reviews.llvm.org/D79907 2020-05-14 04:13:16 +08:00			`MachineInstr *SetGPRMI = nullptr;`
			`const unsigned Threshold = 20;`
			`unsigned Count = 0;`
			`// Scan the block for two S_SET_GPR_IDX_ON instructions to see if a`
			`// second is not needed. Do expensive checks in the optimizeSetGPR()`
			`// and limit the distance to 20 instructions for compile time purposes.`
			`for (MachineBasicBlock::iterator MBBI = MBB.begin(); MBBI != MBBE; ) {`
			`MachineInstr &MI = *MBBI;`
			`++MBBI;`

			`if (Count == Threshold)`
			`SetGPRMI = nullptr;`
			`else`
			`++Count;`

			`if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)`
			`continue;`

			`Count = 0;`
			`if (!SetGPRMI) {`
			`SetGPRMI = &MI;`
			`continue;`
			`}`

			`if (optimizeSetGPR(*SetGPRMI, MI))`
			`Changed = true;`
			`else`
			`SetGPRMI = &MI;`
[AMDGPU] Add SIPreEmitPeephole pass. This pass can handle all the optimization opportunities found just before code emission. Presently it includes the handling of vcc branch optimization that was handled earlier in SIInsertSkips. Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D76712 2020-03-25 22:15:16 +08:00			`}`
			`}`

			`return Changed;`
			`}`