2014-11-22 06:06:37 +08:00
|
|
|
//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
|
|
|
|
//
|
2019-01-19 16:50:56 +08:00
|
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
2014-11-22 06:06:37 +08:00
|
|
|
//
|
|
|
|
/// \file
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
|
|
|
|
#include "AMDGPU.h"
|
|
|
|
#include "AMDGPUSubtarget.h"
|
|
|
|
#include "SIInstrInfo.h"
|
2017-02-28 03:35:42 +08:00
|
|
|
#include "SIMachineFunctionInfo.h"
|
AMDGPU: Remove #include "MCTargetDesc/AMDGPUMCTargetDesc.h" from common headers
Summary:
MCTargetDesc/AMDGPUMCTargetDesc.h contains enums for all the instruction
and register definitions, which are huge so we only want to include
them where needed.
This will also make it easier if we want to split the R600 and GCN
definitions into separate tablegenerated files.
I was unable to remove AMDGPUMCTargetDesc.h from SIMachineFunctionInfo.h
because it uses some enums from the header to initialize default values
for the SIMachineFunction class, so I ended up having to remove includes of
SIMachineFunctionInfo.h from headers too.
Reviewers: arsenm, nhaehnle
Reviewed By: nhaehnle
Subscribers: MatzeB, kzhuravl, wdng, yaxunl, dstuttard, tpr, t-tye, javed.absar, llvm-commits
Differential Revision: https://reviews.llvm.org/D46272
llvm-svn: 332930
2018-05-22 10:03:23 +08:00
|
|
|
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
|
2017-06-21 02:56:32 +08:00
|
|
|
#include "llvm/ADT/DepthFirstIterator.h"
|
2019-10-25 03:36:24 +08:00
|
|
|
#include "llvm/ADT/SetVector.h"
|
2014-11-22 06:06:37 +08:00
|
|
|
#include "llvm/CodeGen/MachineFunctionPass.h"
|
|
|
|
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
|
|
|
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
|
|
|
#include "llvm/Support/Debug.h"
|
2015-03-24 03:32:43 +08:00
|
|
|
#include "llvm/Support/raw_ostream.h"
|
2014-11-22 06:06:37 +08:00
|
|
|
#include "llvm/Target/TargetMachine.h"
|
|
|
|
|
|
|
|
#define DEBUG_TYPE "si-fold-operands"
|
|
|
|
using namespace llvm;
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
2015-01-08 01:42:16 +08:00
|
|
|
struct FoldCandidate {
|
|
|
|
MachineInstr *UseMI;
|
2016-09-14 23:51:33 +08:00
|
|
|
union {
|
|
|
|
MachineOperand *OpToFold;
|
|
|
|
uint64_t ImmToFold;
|
|
|
|
int FrameIndexToFold;
|
|
|
|
};
|
2018-08-29 02:34:24 +08:00
|
|
|
int ShrinkOpcode;
|
2020-07-23 07:22:34 +08:00
|
|
|
unsigned UseOpNo;
|
2016-09-14 23:51:33 +08:00
|
|
|
MachineOperand::MachineOperandType Kind;
|
2017-06-03 08:41:52 +08:00
|
|
|
bool Commuted;
|
2015-01-08 01:42:16 +08:00
|
|
|
|
2017-06-03 08:41:52 +08:00
|
|
|
FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
|
2018-08-29 02:34:24 +08:00
|
|
|
bool Commuted_ = false,
|
|
|
|
int ShrinkOp = -1) :
|
|
|
|
UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
|
|
|
|
Kind(FoldOp->getType()),
|
2017-06-03 08:41:52 +08:00
|
|
|
Commuted(Commuted_) {
|
2015-01-08 06:44:19 +08:00
|
|
|
if (FoldOp->isImm()) {
|
|
|
|
ImmToFold = FoldOp->getImm();
|
2016-09-14 23:51:33 +08:00
|
|
|
} else if (FoldOp->isFI()) {
|
|
|
|
FrameIndexToFold = FoldOp->getIndex();
|
2015-01-08 06:44:19 +08:00
|
|
|
} else {
|
AMDGPU: Write LDS objects out as global symbols in code generation
Summary:
The symbols use the processor-specific SHN_AMDGPU_LDS section index
introduced with a previous change. The linker is then expected to resolve
relocations, which are also emitted.
Initially disabled for HSA and PAL environments until they have caught up
in terms of linker and runtime loader.
Some notes:
- The llvm.amdgcn.groupstaticsize intrinsics can no longer be lowered
to a constant at compile times, which means some tests can no longer
be applied.
The current "solution" is a terrible hack, but the intrinsic isn't
used by Mesa, so we can keep it for now.
- We no longer know the full LDS size per kernel at compile time, which
means that we can no longer generate a relevant error message at
compile time. It would be possible to add a check for the size of
individual variables, but ultimately the linker will have to perform
the final check.
Change-Id: If66dbf33fccfbf3609aefefa2558ac0850d42275
Reviewers: arsenm, rampitec, t-tye, b-sumner, jsjodin
Subscribers: qcolombet, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D61494
llvm-svn: 364297
2019-06-25 19:52:30 +08:00
|
|
|
assert(FoldOp->isReg() || FoldOp->isGlobal());
|
2015-01-08 06:44:19 +08:00
|
|
|
OpToFold = FoldOp;
|
|
|
|
}
|
|
|
|
}
|
2015-01-08 01:42:16 +08:00
|
|
|
|
2016-09-14 23:51:33 +08:00
|
|
|
bool isFI() const {
|
|
|
|
return Kind == MachineOperand::MO_FrameIndex;
|
|
|
|
}
|
|
|
|
|
2015-01-08 01:42:16 +08:00
|
|
|
bool isImm() const {
|
2016-09-14 23:51:33 +08:00
|
|
|
return Kind == MachineOperand::MO_Immediate;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool isReg() const {
|
|
|
|
return Kind == MachineOperand::MO_Register;
|
2015-01-08 01:42:16 +08:00
|
|
|
}
|
2017-06-03 08:41:52 +08:00
|
|
|
|
AMDGPU: Write LDS objects out as global symbols in code generation
Summary:
The symbols use the processor-specific SHN_AMDGPU_LDS section index
introduced with a previous change. The linker is then expected to resolve
relocations, which are also emitted.
Initially disabled for HSA and PAL environments until they have caught up
in terms of linker and runtime loader.
Some notes:
- The llvm.amdgcn.groupstaticsize intrinsics can no longer be lowered
to a constant at compile times, which means some tests can no longer
be applied.
The current "solution" is a terrible hack, but the intrinsic isn't
used by Mesa, so we can keep it for now.
- We no longer know the full LDS size per kernel at compile time, which
means that we can no longer generate a relevant error message at
compile time. It would be possible to add a check for the size of
individual variables, but ultimately the linker will have to perform
the final check.
Change-Id: If66dbf33fccfbf3609aefefa2558ac0850d42275
Reviewers: arsenm, rampitec, t-tye, b-sumner, jsjodin
Subscribers: qcolombet, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D61494
llvm-svn: 364297
2019-06-25 19:52:30 +08:00
|
|
|
bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
|
|
|
|
|
2017-06-03 08:41:52 +08:00
|
|
|
bool isCommuted() const {
|
|
|
|
return Commuted;
|
|
|
|
}
|
2018-08-29 02:34:24 +08:00
|
|
|
|
|
|
|
bool needsShrink() const {
|
|
|
|
return ShrinkOpcode != -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
int getShrinkOpcode() const {
|
|
|
|
return ShrinkOpcode;
|
|
|
|
}
|
2015-01-08 01:42:16 +08:00
|
|
|
};
|
|
|
|
|
2017-01-11 07:32:04 +08:00
|
|
|
class SIFoldOperands : public MachineFunctionPass {
|
|
|
|
public:
|
|
|
|
static char ID;
|
|
|
|
MachineRegisterInfo *MRI;
|
|
|
|
const SIInstrInfo *TII;
|
|
|
|
const SIRegisterInfo *TRI;
|
2018-07-12 04:59:01 +08:00
|
|
|
const GCNSubtarget *ST;
|
2019-06-24 22:53:56 +08:00
|
|
|
const SIMachineFunctionInfo *MFI;
|
2017-01-11 07:32:04 +08:00
|
|
|
|
|
|
|
void foldOperand(MachineOperand &OpToFold,
|
|
|
|
MachineInstr *UseMI,
|
2019-06-24 22:53:56 +08:00
|
|
|
int UseOpIdx,
|
2017-01-11 07:32:04 +08:00
|
|
|
SmallVectorImpl<FoldCandidate> &FoldList,
|
|
|
|
SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
|
|
|
|
|
|
|
|
void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
|
|
|
|
|
2017-02-23 07:27:53 +08:00
|
|
|
const MachineOperand *isClamp(const MachineInstr &MI) const;
|
|
|
|
bool tryFoldClamp(MachineInstr &MI);
|
|
|
|
|
2017-02-28 03:35:42 +08:00
|
|
|
std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
|
|
|
|
bool tryFoldOMod(MachineInstr &MI);
|
|
|
|
|
2017-01-11 07:32:04 +08:00
|
|
|
public:
|
|
|
|
SIFoldOperands() : MachineFunctionPass(ID) {
|
|
|
|
initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
|
|
|
|
}
|
|
|
|
|
|
|
|
bool runOnMachineFunction(MachineFunction &MF) override;
|
|
|
|
|
|
|
|
StringRef getPassName() const override { return "SI Fold Operands"; }
|
|
|
|
|
|
|
|
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
|
|
|
AU.setPreservesCFG();
|
|
|
|
MachineFunctionPass::getAnalysisUsage(AU);
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2014-11-22 06:06:37 +08:00
|
|
|
} // End anonymous namespace.
|
|
|
|
|
2016-02-11 14:15:34 +08:00
|
|
|
INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
|
|
|
|
"SI Fold Operands", false, false)
|
2014-11-22 06:06:37 +08:00
|
|
|
|
|
|
|
char SIFoldOperands::ID = 0;
|
|
|
|
|
|
|
|
char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
|
|
|
|
|
2017-01-12 06:00:02 +08:00
|
|
|
// Wrapper around isInlineConstant that understands special cases when
|
|
|
|
// instruction types are replaced during operand folding.
|
|
|
|
static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
|
|
|
|
const MachineInstr &UseMI,
|
|
|
|
unsigned OpNo,
|
|
|
|
const MachineOperand &OpToFold) {
|
|
|
|
if (TII->isInlineConstant(UseMI, OpNo, OpToFold))
|
|
|
|
return true;
|
|
|
|
|
|
|
|
unsigned Opc = UseMI.getOpcode();
|
|
|
|
switch (Opc) {
|
|
|
|
case AMDGPU::V_MAC_F32_e64:
|
2018-05-01 03:08:16 +08:00
|
|
|
case AMDGPU::V_MAC_F16_e64:
|
2019-09-26 02:40:20 +08:00
|
|
|
case AMDGPU::V_FMAC_F32_e64:
|
|
|
|
case AMDGPU::V_FMAC_F16_e64: {
|
2017-01-12 06:00:02 +08:00
|
|
|
// Special case for mac. Since this is replaced with mad when folded into
|
|
|
|
// src2, we need to check the legality for the final instruction.
|
|
|
|
int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
|
|
|
|
if (static_cast<int>(OpNo) == Src2Idx) {
|
2019-09-26 02:40:20 +08:00
|
|
|
bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64 ||
|
|
|
|
Opc == AMDGPU::V_FMAC_F16_e64;
|
|
|
|
bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64 ||
|
|
|
|
Opc == AMDGPU::V_FMAC_F32_e64;
|
2018-05-01 03:08:16 +08:00
|
|
|
|
|
|
|
unsigned Opc = IsFMA ?
|
2019-09-26 02:40:20 +08:00
|
|
|
(IsF32 ? AMDGPU::V_FMA_F32 : AMDGPU::V_FMA_F16_gfx9) :
|
|
|
|
(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
|
2018-05-01 03:08:16 +08:00
|
|
|
const MCInstrDesc &MadDesc = TII->get(Opc);
|
2017-01-12 06:00:02 +08:00
|
|
|
return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
|
|
|
|
}
|
2017-07-07 18:18:57 +08:00
|
|
|
return false;
|
2017-01-12 06:00:02 +08:00
|
|
|
}
|
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-06-24 22:53:56 +08:00
|
|
|
// TODO: Add heuristic that the frame index might not fit in the addressing mode
|
|
|
|
// immediate offset to avoid materializing in loops.
|
|
|
|
static bool frameIndexMayFold(const SIInstrInfo *TII,
|
|
|
|
const MachineInstr &UseMI,
|
|
|
|
int OpNo,
|
|
|
|
const MachineOperand &OpToFold) {
|
|
|
|
return OpToFold.isFI() &&
|
2020-10-22 05:27:03 +08:00
|
|
|
TII->isMUBUF(UseMI) &&
|
2019-06-24 22:53:56 +08:00
|
|
|
OpNo == AMDGPU::getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::vaddr);
|
|
|
|
}
|
|
|
|
|
2014-11-22 06:06:37 +08:00
|
|
|
/// Factory for the pass: returns a newly heap-allocated SIFoldOperands.
FunctionPass *llvm::createSIFoldOperandsPass() {
  FunctionPass *Pass = new SIFoldOperands();
  return Pass;
}
|
|
|
|
|
2015-01-08 01:42:16 +08:00
|
|
|
static bool updateOperand(FoldCandidate &Fold,
|
2018-08-29 02:34:24 +08:00
|
|
|
const SIInstrInfo &TII,
|
2019-05-02 12:01:39 +08:00
|
|
|
const TargetRegisterInfo &TRI,
|
|
|
|
const GCNSubtarget &ST) {
|
2015-01-08 01:42:16 +08:00
|
|
|
MachineInstr *MI = Fold.UseMI;
|
|
|
|
MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
|
2014-11-22 06:06:37 +08:00
|
|
|
assert(Old.isReg());
|
|
|
|
|
2015-01-08 01:42:16 +08:00
|
|
|
if (Fold.isImm()) {
|
2019-05-02 12:01:39 +08:00
|
|
|
if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked &&
|
2019-07-12 05:19:33 +08:00
|
|
|
!(MI->getDesc().TSFlags & SIInstrFlags::IsMAI) &&
|
2020-09-05 03:44:01 +08:00
|
|
|
AMDGPU::isFoldableLiteralV216(Fold.ImmToFold,
|
|
|
|
ST.hasInv2PiInlineImm())) {
|
2018-04-20 05:16:50 +08:00
|
|
|
// Set op_sel/op_sel_hi on this operand or bail out if op_sel is
|
|
|
|
// already set.
|
2018-04-18 07:09:05 +08:00
|
|
|
unsigned Opcode = MI->getOpcode();
|
|
|
|
int OpNo = MI->getOperandNo(&Old);
|
|
|
|
int ModIdx = -1;
|
|
|
|
if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
|
|
|
|
ModIdx = AMDGPU::OpName::src0_modifiers;
|
|
|
|
else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
|
|
|
|
ModIdx = AMDGPU::OpName::src1_modifiers;
|
|
|
|
else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
|
|
|
|
ModIdx = AMDGPU::OpName::src2_modifiers;
|
|
|
|
assert(ModIdx != -1);
|
|
|
|
ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
|
|
|
|
MachineOperand &Mod = MI->getOperand(ModIdx);
|
|
|
|
unsigned Val = Mod.getImm();
|
2020-09-05 03:44:01 +08:00
|
|
|
if (!(Val & SISrcMods::OP_SEL_0) && (Val & SISrcMods::OP_SEL_1)) {
|
|
|
|
// Only apply the following transformation if that operand requries
|
|
|
|
// a packed immediate.
|
|
|
|
switch (TII.get(Opcode).OpInfo[OpNo].OperandType) {
|
|
|
|
case AMDGPU::OPERAND_REG_IMM_V2FP16:
|
|
|
|
case AMDGPU::OPERAND_REG_IMM_V2INT16:
|
|
|
|
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
|
|
|
|
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
|
|
|
|
// If upper part is all zero we do not need op_sel_hi.
|
|
|
|
if (!isUInt<16>(Fold.ImmToFold)) {
|
|
|
|
if (!(Fold.ImmToFold & 0xffff)) {
|
|
|
|
Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
|
|
|
|
Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
|
|
|
|
Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
|
|
|
|
return true;
|
|
|
|
}
|
[AMDGPU] Fix an issue in `op_sel_hi` skipping.
Summary:
- Only apply packed literal `op_sel_hi` skipping on operands requiring
packed literals. Even an instruction is `packed`, it may have operand
requiring non-packed literal, such as `v_dot2_f32_f16`.
Reviewers: rampitec, arsenm, kzhuravl
Subscribers: jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D60978
llvm-svn: 358922
2019-04-23 06:05:49 +08:00
|
|
|
Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
|
2020-09-05 03:44:01 +08:00
|
|
|
Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
|
[AMDGPU] Fix an issue in `op_sel_hi` skipping.
Summary:
- Only apply packed literal `op_sel_hi` skipping on operands requiring
packed literals. Even an instruction is `packed`, it may have operand
requiring non-packed literal, such as `v_dot2_f32_f16`.
Reviewers: rampitec, arsenm, kzhuravl
Subscribers: jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D60978
llvm-svn: 358922
2019-04-23 06:05:49 +08:00
|
|
|
return true;
|
|
|
|
}
|
2020-09-05 03:44:01 +08:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
break;
|
2018-04-20 05:16:50 +08:00
|
|
|
}
|
|
|
|
}
|
2018-04-18 07:09:05 +08:00
|
|
|
}
|
2019-05-03 23:21:53 +08:00
|
|
|
}
|
2018-08-29 02:34:24 +08:00
|
|
|
|
AMDGPU: Write LDS objects out as global symbols in code generation
Summary:
The symbols use the processor-specific SHN_AMDGPU_LDS section index
introduced with a previous change. The linker is then expected to resolve
relocations, which are also emitted.
Initially disabled for HSA and PAL environments until they have caught up
in terms of linker and runtime loader.
Some notes:
- The llvm.amdgcn.groupstaticsize intrinsics can no longer be lowered
to a constant at compile times, which means some tests can no longer
be applied.
The current "solution" is a terrible hack, but the intrinsic isn't
used by Mesa, so we can keep it for now.
- We no longer know the full LDS size per kernel at compile time, which
means that we can no longer generate a relevant error message at
compile time. It would be possible to add a check for the size of
individual variables, but ultimately the linker will have to perform
the final check.
Change-Id: If66dbf33fccfbf3609aefefa2558ac0850d42275
Reviewers: arsenm, rampitec, t-tye, b-sumner, jsjodin
Subscribers: qcolombet, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D61494
llvm-svn: 364297
2019-06-25 19:52:30 +08:00
|
|
|
if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
|
2019-05-03 23:21:53 +08:00
|
|
|
MachineBasicBlock *MBB = MI->getParent();
|
2019-10-21 01:44:17 +08:00
|
|
|
auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI, 16);
|
|
|
|
if (Liveness != MachineBasicBlock::LQR_Dead) {
|
|
|
|
LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
|
2019-05-03 23:21:53 +08:00
|
|
|
return false;
|
2019-10-21 01:44:17 +08:00
|
|
|
}
|
2018-08-29 02:34:24 +08:00
|
|
|
|
2019-05-03 23:21:53 +08:00
|
|
|
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
|
|
|
|
int Op32 = Fold.getShrinkOpcode();
|
|
|
|
MachineOperand &Dst0 = MI->getOperand(0);
|
|
|
|
MachineOperand &Dst1 = MI->getOperand(1);
|
|
|
|
assert(Dst0.isDef() && Dst1.isDef());
|
2018-08-29 02:44:16 +08:00
|
|
|
|
2019-05-03 23:21:53 +08:00
|
|
|
bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg());
|
2018-08-29 02:34:24 +08:00
|
|
|
|
2019-05-03 23:21:53 +08:00
|
|
|
const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg());
|
Apply llvm-prefer-register-over-unsigned from clang-tidy to LLVM
Summary:
This clang-tidy check is looking for unsigned integer variables whose initializer
starts with an implicit cast from llvm::Register and changes the type of the
variable to llvm::Register (dropping the llvm:: where possible).
Partial reverts in:
X86FrameLowering.cpp - Some functions return unsigned and arguably should be MCRegister
X86FixupLEAs.cpp - Some functions return unsigned and arguably should be MCRegister
X86FrameLowering.cpp - Some functions return unsigned and arguably should be MCRegister
HexagonBitSimplify.cpp - Function takes BitTracker::RegisterRef which appears to be unsigned&
MachineVerifier.cpp - Ambiguous operator==() given MCRegister and const Register
PPCFastISel.cpp - No Register::operator-=()
PeepholeOptimizer.cpp - TargetInstrInfo::optimizeLoadInstr() takes an unsigned&
MachineTraceMetrics.cpp - MachineTraceMetrics lacks a suitable constructor
Manual fixups in:
ARMFastISel.cpp - ARMEmitLoad() now takes a Register& instead of unsigned&
HexagonSplitDouble.cpp - Ternary operator was ambiguous between unsigned/Register
HexagonConstExtenders.cpp - Has a local class named Register, used llvm::Register instead of Register.
PPCFastISel.cpp - PPCEmitLoad() now takes a Register& instead of unsigned&
Depends on D65919
Reviewers: arsenm, bogner, craig.topper, RKSimon
Reviewed By: arsenm
Subscribers: RKSimon, craig.topper, lenary, aemerson, wuzish, jholewinski, MatzeB, qcolombet, dschuff, jyknight, dylanmckay, sdardis, nemanjai, jvesely, wdng, nhaehnle, sbc100, jgravelle-google, kristof.beyls, hiraditya, aheejin, kbarton, fedor.sergeev, javed.absar, asb, rbar, johnrusso, simoncook, apazos, sabuasal, niosHD, jrtc27, MaskRay, zzheng, edward-jones, atanasyan, rogfer01, MartinMosbeck, brucehoult, the_o, tpr, PkmX, jocewei, jsji, Petar.Avramovic, asbirlea, Jim, s.egerton, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65962
llvm-svn: 369041
2019-08-16 03:22:08 +08:00
|
|
|
Register NewReg0 = MRI.createVirtualRegister(Dst0RC);
|
2018-08-29 02:34:24 +08:00
|
|
|
|
2019-05-03 23:21:53 +08:00
|
|
|
MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32);
|
2018-08-29 02:44:16 +08:00
|
|
|
|
2019-05-03 23:21:53 +08:00
|
|
|
if (HaveNonDbgCarryUse) {
|
|
|
|
BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), Dst1.getReg())
|
|
|
|
.addReg(AMDGPU::VCC, RegState::Kill);
|
2018-08-29 02:34:24 +08:00
|
|
|
}
|
|
|
|
|
2019-05-03 23:21:53 +08:00
|
|
|
// Keep the old instruction around to avoid breaking iterators, but
|
|
|
|
// replace it with a dummy instruction to remove uses.
|
|
|
|
//
|
|
|
|
// FIXME: We should not invert how this pass looks at operands to avoid
|
|
|
|
// this. Should track set of foldable movs instead of looking for uses
|
|
|
|
// when looking at a use.
|
|
|
|
Dst0.setReg(NewReg0);
|
|
|
|
for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
|
|
|
|
MI->RemoveOperand(I);
|
|
|
|
MI->setDesc(TII.get(AMDGPU::IMPLICIT_DEF));
|
|
|
|
|
|
|
|
if (Fold.isCommuted())
|
|
|
|
TII.commuteInstruction(*Inst32, false);
|
2014-11-22 06:06:37 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2018-08-29 02:34:24 +08:00
|
|
|
assert(!Fold.needsShrink() && "not handled");
|
|
|
|
|
2019-05-03 23:21:53 +08:00
|
|
|
if (Fold.isImm()) {
|
|
|
|
Old.ChangeToImmediate(Fold.ImmToFold);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
AMDGPU: Write LDS objects out as global symbols in code generation
Summary:
The symbols use the processor-specific SHN_AMDGPU_LDS section index
introduced with a previous change. The linker is then expected to resolve
relocations, which are also emitted.
Initially disabled for HSA and PAL environments until they have caught up
in terms of linker and runtime loader.
Some notes:
- The llvm.amdgcn.groupstaticsize intrinsics can no longer be lowered
to a constant at compile times, which means some tests can no longer
be applied.
The current "solution" is a terrible hack, but the intrinsic isn't
used by Mesa, so we can keep it for now.
- We no longer know the full LDS size per kernel at compile time, which
means that we can no longer generate a relevant error message at
compile time. It would be possible to add a check for the size of
individual variables, but ultimately the linker will have to perform
the final check.
Change-Id: If66dbf33fccfbf3609aefefa2558ac0850d42275
Reviewers: arsenm, rampitec, t-tye, b-sumner, jsjodin
Subscribers: qcolombet, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D61494
llvm-svn: 364297
2019-06-25 19:52:30 +08:00
|
|
|
if (Fold.isGlobal()) {
|
|
|
|
Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
|
|
|
|
Fold.OpToFold->getTargetFlags());
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2016-09-14 23:51:33 +08:00
|
|
|
if (Fold.isFI()) {
|
|
|
|
Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2015-01-08 01:42:16 +08:00
|
|
|
MachineOperand *New = Fold.OpToFold;
|
2019-06-18 20:23:45 +08:00
|
|
|
Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
|
|
|
|
Old.setIsUndef(New->isUndef());
|
|
|
|
return true;
|
2014-11-22 06:06:37 +08:00
|
|
|
}
|
|
|
|
|
2017-01-11 07:32:04 +08:00
|
|
|
static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
|
2015-07-13 23:47:57 +08:00
|
|
|
const MachineInstr *MI) {
|
|
|
|
for (auto Candidate : FoldList) {
|
|
|
|
if (Candidate.UseMI == MI)
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
[AMDGPU] Skip additional folding on the same operand.
Reviewers: rampitec, arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69355
2019-10-24 03:19:06 +08:00
|
|
|
/// Queues a fold of \p FoldOp into operand \p OpNo of \p MI, unless
/// \p FoldList already holds a candidate for that same instruction/operand
/// pair, in which case nothing is added.
static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
                                MachineInstr *MI, unsigned OpNo,
                                MachineOperand *FoldOp, bool Commuted = false,
                                int ShrinkOp = -1) {
  // Only the first fold recorded for a given operand is kept.
  for (const FoldCandidate &Existing : FoldList) {
    const bool SameOperand = Existing.UseMI == MI && Existing.UseOpNo == OpNo;
    if (SameOperand)
      return;
  }

  LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
                    << " operand " << OpNo << "\n " << *MI << '\n');
  FoldList.emplace_back(MI, OpNo, FoldOp, Commuted, ShrinkOp);
}
|
|
|
|
|
2017-01-11 07:32:04 +08:00
|
|
|
static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
|
2015-01-08 06:44:19 +08:00
|
|
|
MachineInstr *MI, unsigned OpNo,
|
|
|
|
MachineOperand *OpToFold,
|
|
|
|
const SIInstrInfo *TII) {
|
2016-06-30 08:01:54 +08:00
|
|
|
if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
|
2016-11-13 15:01:11 +08:00
|
|
|
// Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
|
2015-07-13 23:47:57 +08:00
|
|
|
unsigned Opc = MI->getOpcode();
|
2018-05-01 03:08:16 +08:00
|
|
|
if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
|
2019-09-26 02:40:20 +08:00
|
|
|
Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
|
2015-07-13 23:47:57 +08:00
|
|
|
(int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
|
2019-09-26 02:40:20 +08:00
|
|
|
bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64 ||
|
|
|
|
Opc == AMDGPU::V_FMAC_F16_e64;
|
|
|
|
bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64 ||
|
|
|
|
Opc == AMDGPU::V_FMAC_F32_e64;
|
2018-05-01 03:08:16 +08:00
|
|
|
unsigned NewOpc = IsFMA ?
|
2019-09-26 02:40:20 +08:00
|
|
|
(IsF32 ? AMDGPU::V_FMA_F32 : AMDGPU::V_FMA_F16_gfx9) :
|
|
|
|
(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
|
2016-11-13 15:01:11 +08:00
|
|
|
|
|
|
|
// Check if changing this to a v_mad_{f16, f32} instruction will allow us
|
|
|
|
// to fold the operand.
|
2018-05-01 03:08:16 +08:00
|
|
|
MI->setDesc(TII->get(NewOpc));
|
2015-07-13 23:47:57 +08:00
|
|
|
bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
|
|
|
|
if (FoldAsMAD) {
|
|
|
|
MI->untieRegOperand(OpNo);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
MI->setDesc(TII->get(Opc));
|
|
|
|
}
|
|
|
|
|
2016-12-07 10:42:15 +08:00
|
|
|
// Special case for s_setreg_b32
|
2020-09-10 00:21:36 +08:00
|
|
|
if (OpToFold->isImm()) {
|
|
|
|
unsigned ImmOpc = 0;
|
|
|
|
if (Opc == AMDGPU::S_SETREG_B32)
|
|
|
|
ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
|
|
|
|
else if (Opc == AMDGPU::S_SETREG_B32_mode)
|
|
|
|
ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
|
|
|
|
if (ImmOpc) {
|
|
|
|
MI->setDesc(TII->get(ImmOpc));
|
|
|
|
appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
|
|
|
|
return true;
|
|
|
|
}
|
2016-12-07 10:42:15 +08:00
|
|
|
}
|
|
|
|
|
2015-07-13 23:47:57 +08:00
|
|
|
// If we are already folding into another operand of MI, then
|
|
|
|
// we can't commute the instruction, otherwise we risk making the
|
|
|
|
// other fold illegal.
|
|
|
|
if (isUseMIInFoldList(FoldList, MI))
|
|
|
|
return false;
|
|
|
|
|
2018-08-29 02:34:24 +08:00
|
|
|
unsigned CommuteOpNo = OpNo;
|
|
|
|
|
2015-01-08 06:44:19 +08:00
|
|
|
// Operand is not legal, so try to commute the instruction to
|
|
|
|
// see if this makes it possible to fold.
|
2015-09-29 04:33:22 +08:00
|
|
|
unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
|
|
|
|
unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
|
2016-06-30 08:01:54 +08:00
|
|
|
bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);
|
2015-01-08 06:44:19 +08:00
|
|
|
|
|
|
|
if (CanCommute) {
|
|
|
|
if (CommuteIdx0 == OpNo)
|
2018-08-29 02:34:24 +08:00
|
|
|
CommuteOpNo = CommuteIdx1;
|
2015-01-08 06:44:19 +08:00
|
|
|
else if (CommuteIdx1 == OpNo)
|
2018-08-29 02:34:24 +08:00
|
|
|
CommuteOpNo = CommuteIdx0;
|
2015-01-08 06:44:19 +08:00
|
|
|
}
|
|
|
|
|
2018-08-29 02:34:24 +08:00
|
|
|
|
2015-09-29 04:33:22 +08:00
|
|
|
// One of operands might be an Imm operand, and OpNo may refer to it after
|
|
|
|
// the call of commuteInstruction() below. Such situations are avoided
|
|
|
|
// here explicitly as OpNo must be a register operand to be a candidate
|
|
|
|
// for memory folding.
|
|
|
|
if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
|
|
|
|
!MI->getOperand(CommuteIdx1).isReg()))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (!CanCommute ||
|
2016-06-30 08:01:54 +08:00
|
|
|
!TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
|
2015-01-08 06:44:19 +08:00
|
|
|
return false;
|
|
|
|
|
2018-08-29 02:34:24 +08:00
|
|
|
if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
|
2020-07-14 21:18:36 +08:00
|
|
|
if ((Opc == AMDGPU::V_ADD_CO_U32_e64 ||
|
|
|
|
Opc == AMDGPU::V_SUB_CO_U32_e64 ||
|
|
|
|
Opc == AMDGPU::V_SUBREV_CO_U32_e64) && // FIXME
|
AMDGPU: Write LDS objects out as global symbols in code generation
Summary:
The symbols use the processor-specific SHN_AMDGPU_LDS section index
introduced with a previous change. The linker is then expected to resolve
relocations, which are also emitted.
Initially disabled for HSA and PAL environments until they have caught up
in terms of linker and runtime loader.
Some notes:
- The llvm.amdgcn.groupstaticsize intrinsics can no longer be lowered
to a constant at compile times, which means some tests can no longer
be applied.
The current "solution" is a terrible hack, but the intrinsic isn't
used by Mesa, so we can keep it for now.
- We no longer know the full LDS size per kernel at compile time, which
means that we can no longer generate a relevant error message at
compile time. It would be possible to add a check for the size of
individual variables, but ultimately the linker will have to perform
the final check.
Change-Id: If66dbf33fccfbf3609aefefa2558ac0850d42275
Reviewers: arsenm, rampitec, t-tye, b-sumner, jsjodin
Subscribers: qcolombet, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D61494
llvm-svn: 364297
2019-06-25 19:52:30 +08:00
|
|
|
(OpToFold->isImm() || OpToFold->isFI() || OpToFold->isGlobal())) {
|
2018-08-29 02:34:24 +08:00
|
|
|
MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
|
|
|
|
|
|
|
|
// Verify the other operand is a VGPR, otherwise we would violate the
|
|
|
|
// constant bus restriction.
|
|
|
|
unsigned OtherIdx = CommuteOpNo == CommuteIdx0 ? CommuteIdx1 : CommuteIdx0;
|
|
|
|
MachineOperand &OtherOp = MI->getOperand(OtherIdx);
|
|
|
|
if (!OtherOp.isReg() ||
|
|
|
|
!TII->getRegisterInfo().isVGPR(MRI, OtherOp.getReg()))
|
|
|
|
return false;
|
|
|
|
|
2018-08-29 03:19:03 +08:00
|
|
|
assert(MI->getOperand(1).isDef());
|
2018-08-29 02:34:24 +08:00
|
|
|
|
2019-05-03 21:42:56 +08:00
|
|
|
// Make sure to get the 32-bit version of the commuted opcode.
|
|
|
|
unsigned MaybeCommutedOpc = MI->getOpcode();
|
|
|
|
int Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
|
|
|
|
|
[AMDGPU] Skip additional folding on the same operand.
Reviewers: rampitec, arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69355
2019-10-24 03:19:06 +08:00
|
|
|
appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true, Op32);
|
2018-08-29 02:34:24 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2017-06-03 08:41:52 +08:00
|
|
|
TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
|
2015-01-08 06:44:19 +08:00
|
|
|
return false;
|
2017-06-03 08:41:52 +08:00
|
|
|
}
|
|
|
|
|
[AMDGPU] Skip additional folding on the same operand.
Reviewers: rampitec, arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69355
2019-10-24 03:19:06 +08:00
|
|
|
appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true);
|
2017-06-03 08:41:52 +08:00
|
|
|
return true;
|
2015-01-08 06:44:19 +08:00
|
|
|
}
|
AMDGPU: Avoid folding 2 constant operands into an SALU operation
Summary:
Catch the (admittedly unusual) case where SIFoldOperands attempts to fold 2
constant operands into the same SALU operation, with neither operand able to be
encoded as an inline constant.
Change-Id: Ibc48d662c9ffd8bbacd154976b0b1c257ace0927
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D70896
2019-12-02 21:01:26 +08:00
|
|
|
|
|
|
|
// Check the case where we might introduce a second constant operand to a
|
|
|
|
// scalar instruction
|
|
|
|
if (TII->isSALU(MI->getOpcode())) {
|
|
|
|
const MCInstrDesc &InstDesc = MI->getDesc();
|
|
|
|
const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo];
|
|
|
|
const SIRegisterInfo &SRI = TII->getRegisterInfo();
|
|
|
|
|
|
|
|
// Fine if the operand can be encoded as an inline constant
|
|
|
|
if (OpToFold->isImm()) {
|
|
|
|
if (!SRI.opCanUseInlineConstant(OpInfo.OperandType) ||
|
|
|
|
!TII->isInlineConstant(*OpToFold, OpInfo)) {
|
|
|
|
// Otherwise check for another constant
|
|
|
|
for (unsigned i = 0, e = InstDesc.getNumOperands(); i != e; ++i) {
|
|
|
|
auto &Op = MI->getOperand(i);
|
|
|
|
if (OpNo != i &&
|
|
|
|
TII->isLiteralConstantLike(Op, OpInfo)) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2015-01-08 06:44:19 +08:00
|
|
|
|
[AMDGPU] Skip additional folding on the same operand.
Reviewers: rampitec, arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69355
2019-10-24 03:19:06 +08:00
|
|
|
appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
|
2015-01-08 06:44:19 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2016-10-07 02:12:13 +08:00
|
|
|
// If the use operand doesn't care about the value, this may be an operand only
|
|
|
|
// used for register indexing, in which case it is unsafe to fold.
|
2017-05-31 00:49:24 +08:00
|
|
|
static bool isUseSafeToFold(const SIInstrInfo *TII,
|
|
|
|
const MachineInstr &MI,
|
2016-10-07 02:12:13 +08:00
|
|
|
const MachineOperand &UseMO) {
|
2020-07-30 08:17:45 +08:00
|
|
|
if (UseMO.isUndef() || TII->isSDWA(MI))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
switch (MI.getOpcode()) {
|
|
|
|
case AMDGPU::V_MOV_B32_e32:
|
|
|
|
case AMDGPU::V_MOV_B32_e64:
|
|
|
|
case AMDGPU::V_MOV_B64_PSEUDO:
|
|
|
|
// Do not fold into an indirect mov.
|
|
|
|
return !MI.hasRegisterImplicitUseOperand(AMDGPU::M0);
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
2016-10-07 02:12:13 +08:00
|
|
|
//return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
|
|
|
|
}
|
|
|
|
|
2019-10-25 03:36:24 +08:00
|
|
|
// Find a def of the UseReg, check if it is a reg_seqence and find initializers
|
|
|
|
// for each subreg, tracking it to foldable inline immediate if possible.
|
|
|
|
// Returns true on success.
|
|
|
|
static bool getRegSeqInit(
|
|
|
|
SmallVectorImpl<std::pair<MachineOperand*, unsigned>> &Defs,
|
|
|
|
Register UseReg, uint8_t OpTy,
|
|
|
|
const SIInstrInfo *TII, const MachineRegisterInfo &MRI) {
|
|
|
|
MachineInstr *Def = MRI.getUniqueVRegDef(UseReg);
|
|
|
|
if (!Def || !Def->isRegSequence())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
|
|
|
|
MachineOperand *Sub = &Def->getOperand(I);
|
|
|
|
assert (Sub->isReg());
|
|
|
|
|
|
|
|
for (MachineInstr *SubDef = MRI.getUniqueVRegDef(Sub->getReg());
|
|
|
|
SubDef && Sub->isReg() && !Sub->getSubReg() &&
|
|
|
|
TII->isFoldableCopy(*SubDef);
|
|
|
|
SubDef = MRI.getUniqueVRegDef(Sub->getReg())) {
|
|
|
|
MachineOperand *Op = &SubDef->getOperand(1);
|
|
|
|
if (Op->isImm()) {
|
|
|
|
if (TII->isInlineConstant(*Op, OpTy))
|
|
|
|
Sub = Op;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (!Op->isReg())
|
|
|
|
break;
|
|
|
|
Sub = Op;
|
|
|
|
}
|
|
|
|
|
|
|
|
Defs.push_back(std::make_pair(Sub, Def->getOperand(I + 1).getImm()));
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2019-07-12 05:19:33 +08:00
|
|
|
static bool tryToFoldACImm(const SIInstrInfo *TII,
|
|
|
|
const MachineOperand &OpToFold,
|
|
|
|
MachineInstr *UseMI,
|
|
|
|
unsigned UseOpIdx,
|
|
|
|
SmallVectorImpl<FoldCandidate> &FoldList) {
|
|
|
|
const MCInstrDesc &Desc = UseMI->getDesc();
|
|
|
|
const MCOperandInfo *OpInfo = Desc.OpInfo;
|
|
|
|
if (!OpInfo || UseOpIdx >= Desc.getNumOperands())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
uint8_t OpTy = OpInfo[UseOpIdx].OperandType;
|
|
|
|
if (OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST ||
|
|
|
|
OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST)
|
|
|
|
return false;
|
|
|
|
|
2019-08-24 06:09:58 +08:00
|
|
|
if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
|
|
|
|
TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
|
2019-07-12 05:19:33 +08:00
|
|
|
UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!OpToFold.isReg())
|
|
|
|
return false;
|
|
|
|
|
Apply llvm-prefer-register-over-unsigned from clang-tidy to LLVM
Summary:
This clang-tidy check is looking for unsigned integer variables whose initializer
starts with an implicit cast from llvm::Register and changes the type of the
variable to llvm::Register (dropping the llvm:: where possible).
Partial reverts in:
X86FrameLowering.cpp - Some functions return unsigned and arguably should be MCRegister
X86FixupLEAs.cpp - Some functions return unsigned and arguably should be MCRegister
X86FrameLowering.cpp - Some functions return unsigned and arguably should be MCRegister
HexagonBitSimplify.cpp - Function takes BitTracker::RegisterRef which appears to be unsigned&
MachineVerifier.cpp - Ambiguous operator==() given MCRegister and const Register
PPCFastISel.cpp - No Register::operator-=()
PeepholeOptimizer.cpp - TargetInstrInfo::optimizeLoadInstr() takes an unsigned&
MachineTraceMetrics.cpp - MachineTraceMetrics lacks a suitable constructor
Manual fixups in:
ARMFastISel.cpp - ARMEmitLoad() now takes a Register& instead of unsigned&
HexagonSplitDouble.cpp - Ternary operator was ambiguous between unsigned/Register
HexagonConstExtenders.cpp - Has a local class named Register, used llvm::Register instead of Register.
PPCFastISel.cpp - PPCEmitLoad() now takes a Register& instead of unsigned&
Depends on D65919
Reviewers: arsenm, bogner, craig.topper, RKSimon
Reviewed By: arsenm
Subscribers: RKSimon, craig.topper, lenary, aemerson, wuzish, jholewinski, MatzeB, qcolombet, dschuff, jyknight, dylanmckay, sdardis, nemanjai, jvesely, wdng, nhaehnle, sbc100, jgravelle-google, kristof.beyls, hiraditya, aheejin, kbarton, fedor.sergeev, javed.absar, asb, rbar, johnrusso, simoncook, apazos, sabuasal, niosHD, jrtc27, MaskRay, zzheng, edward-jones, atanasyan, rogfer01, MartinMosbeck, brucehoult, the_o, tpr, PkmX, jocewei, jsji, Petar.Avramovic, asbirlea, Jim, s.egerton, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65962
llvm-svn: 369041
2019-08-16 03:22:08 +08:00
|
|
|
Register UseReg = OpToFold.getReg();
|
2020-08-21 00:46:16 +08:00
|
|
|
if (!UseReg.isVirtual())
|
2019-07-12 05:19:33 +08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
if (llvm::find_if(FoldList, [UseMI](const FoldCandidate &FC) {
|
|
|
|
return FC.UseMI == UseMI; }) != FoldList.end())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
MachineRegisterInfo &MRI = UseMI->getParent()->getParent()->getRegInfo();
|
2019-10-25 03:36:24 +08:00
|
|
|
SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
|
|
|
|
if (!getRegSeqInit(Defs, UseReg, OpTy, TII, MRI))
|
2019-07-12 05:19:33 +08:00
|
|
|
return false;
|
|
|
|
|
2019-10-25 03:36:24 +08:00
|
|
|
int32_t Imm;
|
|
|
|
for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
|
|
|
|
const MachineOperand *Op = Defs[I].first;
|
|
|
|
if (!Op->isImm())
|
2019-07-12 05:19:33 +08:00
|
|
|
return false;
|
2019-10-25 03:36:24 +08:00
|
|
|
|
2019-07-12 05:19:33 +08:00
|
|
|
auto SubImm = Op->getImm();
|
2019-10-25 03:36:24 +08:00
|
|
|
if (!I) {
|
|
|
|
Imm = SubImm;
|
|
|
|
if (!TII->isInlineConstant(*Op, OpTy) ||
|
|
|
|
!TII->isOperandLegal(*UseMI, UseOpIdx, Op))
|
2019-07-12 05:19:33 +08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (Imm != SubImm)
|
|
|
|
return false; // Can only fold splat constants
|
|
|
|
}
|
|
|
|
|
2019-10-25 03:36:24 +08:00
|
|
|
appendFoldCandidate(FoldList, UseMI, UseOpIdx, Defs[0].first);
|
2019-07-12 05:19:33 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2017-01-11 07:32:04 +08:00
|
|
|
void SIFoldOperands::foldOperand(
|
|
|
|
MachineOperand &OpToFold,
|
|
|
|
MachineInstr *UseMI,
|
2019-06-24 22:53:56 +08:00
|
|
|
int UseOpIdx,
|
2017-01-11 07:32:04 +08:00
|
|
|
SmallVectorImpl<FoldCandidate> &FoldList,
|
|
|
|
SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
|
2015-08-29 07:45:19 +08:00
|
|
|
const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
|
|
|
|
|
2017-05-31 00:49:24 +08:00
|
|
|
if (!isUseSafeToFold(TII, *UseMI, UseOp))
|
2016-10-07 02:12:13 +08:00
|
|
|
return;
|
|
|
|
|
2015-08-29 07:45:19 +08:00
|
|
|
// FIXME: Fold operands with subregs.
|
2016-08-16 00:18:36 +08:00
|
|
|
if (UseOp.isReg() && OpToFold.isReg()) {
|
|
|
|
if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
|
|
|
|
return;
|
2015-08-29 07:45:19 +08:00
|
|
|
}
|
|
|
|
|
2015-09-09 23:43:26 +08:00
|
|
|
// Special case for REG_SEQUENCE: We can't fold literals into
|
|
|
|
// REG_SEQUENCE instructions, so we have to fold them into the
|
|
|
|
// uses of REG_SEQUENCE.
|
2016-11-24 05:51:07 +08:00
|
|
|
if (UseMI->isRegSequence()) {
|
Apply llvm-prefer-register-over-unsigned from clang-tidy to LLVM
Summary:
This clang-tidy check is looking for unsigned integer variables whose initializer
starts with an implicit cast from llvm::Register and changes the type of the
variable to llvm::Register (dropping the llvm:: where possible).
Partial reverts in:
X86FrameLowering.cpp - Some functions return unsigned and arguably should be MCRegister
X86FixupLEAs.cpp - Some functions return unsigned and arguably should be MCRegister
X86FrameLowering.cpp - Some functions return unsigned and arguably should be MCRegister
HexagonBitSimplify.cpp - Function takes BitTracker::RegisterRef which appears to be unsigned&
MachineVerifier.cpp - Ambiguous operator==() given MCRegister and const Register
PPCFastISel.cpp - No Register::operator-=()
PeepholeOptimizer.cpp - TargetInstrInfo::optimizeLoadInstr() takes an unsigned&
MachineTraceMetrics.cpp - MachineTraceMetrics lacks a suitable constructor
Manual fixups in:
ARMFastISel.cpp - ARMEmitLoad() now takes a Register& instead of unsigned&
HexagonSplitDouble.cpp - Ternary operator was ambiguous between unsigned/Register
HexagonConstExtenders.cpp - Has a local class named Register, used llvm::Register instead of Register.
PPCFastISel.cpp - PPCEmitLoad() now takes a Register& instead of unsigned&
Depends on D65919
Reviewers: arsenm, bogner, craig.topper, RKSimon
Reviewed By: arsenm
Subscribers: RKSimon, craig.topper, lenary, aemerson, wuzish, jholewinski, MatzeB, qcolombet, dschuff, jyknight, dylanmckay, sdardis, nemanjai, jvesely, wdng, nhaehnle, sbc100, jgravelle-google, kristof.beyls, hiraditya, aheejin, kbarton, fedor.sergeev, javed.absar, asb, rbar, johnrusso, simoncook, apazos, sabuasal, niosHD, jrtc27, MaskRay, zzheng, edward-jones, atanasyan, rogfer01, MartinMosbeck, brucehoult, the_o, tpr, PkmX, jocewei, jsji, Petar.Avramovic, asbirlea, Jim, s.egerton, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65962
llvm-svn: 369041
2019-08-16 03:22:08 +08:00
|
|
|
Register RegSeqDstReg = UseMI->getOperand(0).getReg();
|
2015-09-09 23:43:26 +08:00
|
|
|
unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
|
|
|
|
|
2020-07-23 03:20:38 +08:00
|
|
|
MachineRegisterInfo::use_nodbg_iterator Next;
|
|
|
|
for (MachineRegisterInfo::use_nodbg_iterator
|
|
|
|
RSUse = MRI->use_nodbg_begin(RegSeqDstReg), RSE = MRI->use_nodbg_end();
|
2019-06-20 04:44:15 +08:00
|
|
|
RSUse != RSE; RSUse = Next) {
|
|
|
|
Next = std::next(RSUse);
|
2015-09-09 23:43:26 +08:00
|
|
|
|
|
|
|
MachineInstr *RSUseMI = RSUse->getParent();
|
2019-07-12 05:19:33 +08:00
|
|
|
|
|
|
|
if (tryToFoldACImm(TII, UseMI->getOperand(0), RSUseMI,
|
|
|
|
RSUse.getOperandNo(), FoldList))
|
|
|
|
continue;
|
|
|
|
|
2015-09-09 23:43:26 +08:00
|
|
|
if (RSUse->getSubReg() != RegSeqDstSubReg)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
|
2017-01-11 07:32:04 +08:00
|
|
|
CopiesToReplace);
|
2015-09-09 23:43:26 +08:00
|
|
|
}
|
2016-11-24 05:51:07 +08:00
|
|
|
|
2015-09-09 23:43:26 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2019-07-12 05:19:33 +08:00
|
|
|
if (tryToFoldACImm(TII, OpToFold, UseMI, UseOpIdx, FoldList))
|
|
|
|
return;
|
|
|
|
|
2019-06-24 22:53:56 +08:00
|
|
|
if (frameIndexMayFold(TII, *UseMI, UseOpIdx, OpToFold)) {
|
|
|
|
// Sanity check that this is a stack access.
|
|
|
|
// FIXME: Should probably use stack pseudos before frame lowering.
|
|
|
|
|
|
|
|
if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
|
|
|
|
MFI->getScratchRSrcReg())
|
|
|
|
return;
|
2015-08-29 07:45:19 +08:00
|
|
|
|
2020-01-22 06:27:57 +08:00
|
|
|
// Ensure this is either relative to the current frame or the current wave.
|
|
|
|
MachineOperand &SOff =
|
|
|
|
*TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
|
|
|
|
if ((!SOff.isReg() || SOff.getReg() != MFI->getStackPtrOffsetReg()) &&
|
|
|
|
(!SOff.isImm() || SOff.getImm() != 0))
|
|
|
|
return;
|
|
|
|
|
2019-06-24 22:53:56 +08:00
|
|
|
// A frame index will resolve to a positive constant, so it should always be
|
|
|
|
// safe to fold the addressing mode, even pre-GFX9.
|
|
|
|
UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
|
2020-01-22 06:27:57 +08:00
|
|
|
|
|
|
|
// If this is relative to the current wave, update it to be relative to the
|
|
|
|
// current frame.
|
|
|
|
if (SOff.isImm())
|
|
|
|
SOff.ChangeToRegister(MFI->getStackPtrOffsetReg(), false);
|
2019-06-24 22:53:56 +08:00
|
|
|
return;
|
|
|
|
}
|
2015-08-29 07:45:19 +08:00
|
|
|
|
AMDGPU: Write LDS objects out as global symbols in code generation
Summary:
The symbols use the processor-specific SHN_AMDGPU_LDS section index
introduced with a previous change. The linker is then expected to resolve
relocations, which are also emitted.
Initially disabled for HSA and PAL environments until they have caught up
in terms of linker and runtime loader.
Some notes:
- The llvm.amdgcn.groupstaticsize intrinsics can no longer be lowered
to a constant at compile times, which means some tests can no longer
be applied.
The current "solution" is a terrible hack, but the intrinsic isn't
used by Mesa, so we can keep it for now.
- We no longer know the full LDS size per kernel at compile time, which
means that we can no longer generate a relevant error message at
compile time. It would be possible to add a check for the size of
individual variables, but ultimately the linker will have to perform
the final check.
Change-Id: If66dbf33fccfbf3609aefefa2558ac0850d42275
Reviewers: arsenm, rampitec, t-tye, b-sumner, jsjodin
Subscribers: qcolombet, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D61494
llvm-svn: 364297
2019-06-25 19:52:30 +08:00
|
|
|
bool FoldingImmLike =
|
|
|
|
OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
|
2019-06-24 22:53:56 +08:00
|
|
|
|
|
|
|
if (FoldingImmLike && UseMI->isCopy()) {
|
Apply llvm-prefer-register-over-unsigned from clang-tidy to LLVM
Summary:
This clang-tidy check is looking for unsigned integer variables whose initializer
starts with an implicit cast from llvm::Register and changes the type of the
variable to llvm::Register (dropping the llvm:: where possible).
Partial reverts in:
X86FrameLowering.cpp - Some functions return unsigned and arguably should be MCRegister
X86FixupLEAs.cpp - Some functions return unsigned and arguably should be MCRegister
X86FrameLowering.cpp - Some functions return unsigned and arguably should be MCRegister
HexagonBitSimplify.cpp - Function takes BitTracker::RegisterRef which appears to be unsigned&
MachineVerifier.cpp - Ambiguous operator==() given MCRegister and const Register
PPCFastISel.cpp - No Register::operator-=()
PeepholeOptimizer.cpp - TargetInstrInfo::optimizeLoadInstr() takes an unsigned&
MachineTraceMetrics.cpp - MachineTraceMetrics lacks a suitable constructor
Manual fixups in:
ARMFastISel.cpp - ARMEmitLoad() now takes a Register& instead of unsigned&
HexagonSplitDouble.cpp - Ternary operator was ambiguous between unsigned/Register
HexagonConstExtenders.cpp - Has a local class named Register, used llvm::Register instead of Register.
PPCFastISel.cpp - PPCEmitLoad() now takes a Register& instead of unsigned&
Depends on D65919
Reviewers: arsenm, bogner, craig.topper, RKSimon
Reviewed By: arsenm
Subscribers: RKSimon, craig.topper, lenary, aemerson, wuzish, jholewinski, MatzeB, qcolombet, dschuff, jyknight, dylanmckay, sdardis, nemanjai, jvesely, wdng, nhaehnle, sbc100, jgravelle-google, kristof.beyls, hiraditya, aheejin, kbarton, fedor.sergeev, javed.absar, asb, rbar, johnrusso, simoncook, apazos, sabuasal, niosHD, jrtc27, MaskRay, zzheng, edward-jones, atanasyan, rogfer01, MartinMosbeck, brucehoult, the_o, tpr, PkmX, jocewei, jsji, Petar.Avramovic, asbirlea, Jim, s.egerton, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65962
llvm-svn: 369041
2019-08-16 03:22:08 +08:00
|
|
|
Register DestReg = UseMI->getOperand(0).getReg();
|
2020-07-20 10:57:24 +08:00
|
|
|
Register SrcReg = UseMI->getOperand(1).getReg();
|
|
|
|
assert(SrcReg.isVirtual());
|
2019-10-10 06:51:42 +08:00
|
|
|
|
2020-07-20 10:57:24 +08:00
|
|
|
const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
|
2019-10-10 06:51:42 +08:00
|
|
|
|
2020-07-20 10:57:24 +08:00
|
|
|
// Don't fold into a copy to a physical register with the same class. Doing
|
|
|
|
// so would interfere with the register coalescer's logic which would avoid
|
|
|
|
// redundant initalizations.
|
|
|
|
if (DestReg.isPhysical() && SrcRC->contains(DestReg))
|
|
|
|
return;
|
2016-11-24 05:51:07 +08:00
|
|
|
|
2020-07-20 10:57:24 +08:00
|
|
|
const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
|
2020-08-09 08:28:48 +08:00
|
|
|
if (!DestReg.isPhysical()) {
|
|
|
|
if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
|
2020-07-23 03:20:38 +08:00
|
|
|
MachineRegisterInfo::use_nodbg_iterator NextUse;
|
2020-08-09 08:28:48 +08:00
|
|
|
SmallVector<FoldCandidate, 4> CopyUses;
|
2020-07-23 03:20:38 +08:00
|
|
|
for (MachineRegisterInfo::use_nodbg_iterator Use = MRI->use_nodbg_begin(DestReg),
|
|
|
|
E = MRI->use_nodbg_end();
|
2020-08-09 08:28:48 +08:00
|
|
|
Use != E; Use = NextUse) {
|
|
|
|
NextUse = std::next(Use);
|
|
|
|
// There's no point trying to fold into an implicit operand.
|
|
|
|
if (Use->isImplicit())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
FoldCandidate FC = FoldCandidate(Use->getParent(), Use.getOperandNo(),
|
|
|
|
&UseMI->getOperand(1));
|
|
|
|
CopyUses.push_back(FC);
|
|
|
|
}
|
|
|
|
for (auto &F : CopyUses) {
|
|
|
|
foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo, FoldList, CopiesToReplace);
|
|
|
|
}
|
2018-08-30 21:55:04 +08:00
|
|
|
}
|
|
|
|
|
2020-08-09 08:28:48 +08:00
|
|
|
if (DestRC == &AMDGPU::AGPR_32RegClass &&
|
|
|
|
TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
|
|
|
|
UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32));
|
|
|
|
UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
|
|
|
|
CopiesToReplace.push_back(UseMI);
|
|
|
|
return;
|
|
|
|
}
|
2019-07-12 05:19:33 +08:00
|
|
|
}
|
|
|
|
|
2018-08-30 21:55:04 +08:00
|
|
|
// In order to fold immediates into copies, we need to change the
|
|
|
|
// copy to a MOV.
|
|
|
|
|
2016-11-24 05:51:07 +08:00
|
|
|
unsigned MovOp = TII->getMovOpcode(DestRC);
|
|
|
|
if (MovOp == AMDGPU::COPY)
|
|
|
|
return;
|
|
|
|
|
|
|
|
UseMI->setDesc(TII->get(MovOp));
|
2019-08-21 23:15:04 +08:00
|
|
|
MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
|
|
|
|
MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
|
|
|
|
while (ImpOpI != ImpOpE) {
|
|
|
|
MachineInstr::mop_iterator Tmp = ImpOpI;
|
|
|
|
ImpOpI++;
|
|
|
|
UseMI->RemoveOperand(UseMI->getOperandNo(Tmp));
|
|
|
|
}
|
2016-11-24 05:51:07 +08:00
|
|
|
CopiesToReplace.push_back(UseMI);
|
|
|
|
} else {
|
2018-09-28 02:55:20 +08:00
|
|
|
if (UseMI->isCopy() && OpToFold.isReg() &&
|
2019-10-22 04:43:01 +08:00
|
|
|
UseMI->getOperand(0).getReg().isVirtual() &&
|
2018-09-28 02:55:20 +08:00
|
|
|
!UseMI->getOperand(1).getSubReg()) {
|
2019-10-22 04:43:01 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "Folding " << OpToFold
|
|
|
|
<< "\n into " << *UseMI << '\n');
|
2019-07-12 05:19:33 +08:00
|
|
|
unsigned Size = TII->getOpSize(*UseMI, 1);
|
2019-10-25 03:36:24 +08:00
|
|
|
Register UseReg = OpToFold.getReg();
|
|
|
|
UseMI->getOperand(1).setReg(UseReg);
|
2018-09-28 02:55:20 +08:00
|
|
|
UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
|
|
|
|
UseMI->getOperand(1).setIsKill(false);
|
|
|
|
CopiesToReplace.push_back(UseMI);
|
|
|
|
OpToFold.setIsKill(false);
|
2019-10-25 03:36:24 +08:00
|
|
|
|
|
|
|
// That is very tricky to store a value into an AGPR. v_accvgpr_write_b32
|
|
|
|
// can only accept VGPR or inline immediate. Recreate a reg_sequence with
|
|
|
|
// its initializers right here, so we will rematerialize immediates and
|
|
|
|
// avoid copies via different reg classes.
|
|
|
|
SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
|
|
|
|
if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
|
|
|
|
getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32, TII,
|
|
|
|
*MRI)) {
|
|
|
|
const DebugLoc &DL = UseMI->getDebugLoc();
|
|
|
|
MachineBasicBlock &MBB = *UseMI->getParent();
|
|
|
|
|
|
|
|
UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
|
|
|
|
for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
|
|
|
|
UseMI->RemoveOperand(I);
|
|
|
|
|
|
|
|
MachineInstrBuilder B(*MBB.getParent(), UseMI);
|
|
|
|
DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
|
|
|
|
SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
|
|
|
|
for (unsigned I = 0; I < Size / 4; ++I) {
|
|
|
|
MachineOperand *Def = Defs[I].first;
|
|
|
|
TargetInstrInfo::RegSubRegPair CopyToVGPR;
|
|
|
|
if (Def->isImm() &&
|
|
|
|
TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
|
|
|
|
int64_t Imm = Def->getImm();
|
|
|
|
|
|
|
|
auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
|
|
|
|
BuildMI(MBB, UseMI, DL,
|
|
|
|
TII->get(AMDGPU::V_ACCVGPR_WRITE_B32), Tmp).addImm(Imm);
|
|
|
|
B.addReg(Tmp);
|
|
|
|
} else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
|
|
|
|
auto Src = getRegSubRegPair(*Def);
|
|
|
|
Def->setIsKill(false);
|
|
|
|
if (!SeenAGPRs.insert(Src)) {
|
|
|
|
// We cannot build a reg_sequence out of the same registers, they
|
|
|
|
// must be copied. Better do it here before copyPhysReg() created
|
|
|
|
// several reads to do the AGPR->VGPR->AGPR copy.
|
|
|
|
CopyToVGPR = Src;
|
|
|
|
} else {
|
|
|
|
B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
|
|
|
|
Src.SubReg);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
assert(Def->isReg());
|
|
|
|
Def->setIsKill(false);
|
|
|
|
auto Src = getRegSubRegPair(*Def);
|
|
|
|
|
|
|
|
// Direct copy from SGPR to AGPR is not possible. To avoid creation
|
|
|
|
// of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
|
|
|
|
// create a copy here and track if we already have such a copy.
|
|
|
|
if (TRI->isSGPRReg(*MRI, Src.Reg)) {
|
|
|
|
CopyToVGPR = Src;
|
|
|
|
} else {
|
|
|
|
auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
|
|
|
|
BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
|
|
|
|
B.addReg(Tmp);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (CopyToVGPR.Reg) {
|
|
|
|
Register Vgpr;
|
|
|
|
if (VGPRCopies.count(CopyToVGPR)) {
|
|
|
|
Vgpr = VGPRCopies[CopyToVGPR];
|
|
|
|
} else {
|
|
|
|
Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
|
|
|
|
BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
|
|
|
|
VGPRCopies[CopyToVGPR] = Vgpr;
|
|
|
|
}
|
|
|
|
auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
|
|
|
|
BuildMI(MBB, UseMI, DL,
|
|
|
|
TII->get(AMDGPU::V_ACCVGPR_WRITE_B32), Tmp).addReg(Vgpr);
|
|
|
|
B.addReg(Tmp);
|
|
|
|
}
|
|
|
|
|
|
|
|
B.addImm(Defs[I].second);
|
|
|
|
}
|
|
|
|
LLVM_DEBUG(dbgs() << "Folded " << *UseMI << '\n');
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2019-07-12 05:19:33 +08:00
|
|
|
if (Size != 4)
|
|
|
|
return;
|
|
|
|
if (TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
|
|
|
|
TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg()))
|
|
|
|
UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32));
|
|
|
|
else if (TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
|
|
|
|
TRI->isAGPR(*MRI, UseMI->getOperand(1).getReg()))
|
|
|
|
UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32));
|
2018-09-28 02:55:20 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2019-06-18 20:23:46 +08:00
|
|
|
unsigned UseOpc = UseMI->getOpcode();
|
|
|
|
if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
|
|
|
|
(UseOpc == AMDGPU::V_READLANE_B32 &&
|
|
|
|
(int)UseOpIdx ==
|
|
|
|
AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
|
|
|
|
// %vgpr = V_MOV_B32 imm
|
|
|
|
// %sgpr = V_READFIRSTLANE_B32 %vgpr
|
|
|
|
// =>
|
|
|
|
// %sgpr = S_MOV_B32 imm
|
2019-06-24 22:53:56 +08:00
|
|
|
if (FoldingImmLike) {
|
2019-06-18 20:48:36 +08:00
|
|
|
if (execMayBeModifiedBeforeUse(*MRI,
|
|
|
|
UseMI->getOperand(UseOpIdx).getReg(),
|
|
|
|
*OpToFold.getParent(),
|
[AMDGPU] Fix DPP combiner check for exec modification
Summary:
r363675 changed the exec modification helper function, now called
execMayBeModifiedBeforeUse, so that if no UseMI is specified it checks
all instructions in the basic block, even beyond the last use. That
meant that the DPP combiner no longer worked in any basic block that
ended with a control flow instruction, and in particular it didn't work
on code sequences generated by the atomic optimizer.
Fix it by reinstating the old behaviour but in a new helper function
execMayBeModifiedBeforeAnyUse, and limiting the number of instructions
scanned.
Reviewers: arsenm, vpykhtin
Subscribers: kzhuravl, nemanjai, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, kbarton, MaskRay, jfb, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D64393
llvm-svn: 365910
2019-07-12 23:59:40 +08:00
|
|
|
*UseMI))
|
2019-06-18 20:23:46 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
|
2019-06-20 04:44:15 +08:00
|
|
|
|
2019-06-24 22:53:56 +08:00
|
|
|
if (OpToFold.isImm())
|
|
|
|
UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
|
|
|
|
else
|
|
|
|
UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
|
2019-06-18 20:23:46 +08:00
|
|
|
UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
|
2019-06-18 20:48:36 +08:00
|
|
|
if (execMayBeModifiedBeforeUse(*MRI,
|
|
|
|
UseMI->getOperand(UseOpIdx).getReg(),
|
|
|
|
*OpToFold.getParent(),
|
[AMDGPU] Fix DPP combiner check for exec modification
Summary:
r363675 changed the exec modification helper function, now called
execMayBeModifiedBeforeUse, so that if no UseMI is specified it checks
all instructions in the basic block, even beyond the last use. That
meant that the DPP combiner no longer worked in any basic block that
ended with a control flow instruction, and in particular it didn't work
on code sequences generated by the atomic optimizer.
Fix it by reinstating the old behaviour but in a new helper function
execMayBeModifiedBeforeAnyUse, and limiting the number of instructions
scanned.
Reviewers: arsenm, vpykhtin
Subscribers: kzhuravl, nemanjai, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, kbarton, MaskRay, jfb, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D64393
llvm-svn: 365910
2019-07-12 23:59:40 +08:00
|
|
|
*UseMI))
|
2019-06-18 20:23:46 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
// %vgpr = COPY %sgpr0
|
|
|
|
// %sgpr1 = V_READFIRSTLANE_B32 %vgpr
|
|
|
|
// =>
|
|
|
|
// %sgpr1 = COPY %sgpr0
|
|
|
|
UseMI->setDesc(TII->get(AMDGPU::COPY));
|
2019-08-14 02:57:55 +08:00
|
|
|
UseMI->getOperand(1).setReg(OpToFold.getReg());
|
|
|
|
UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
|
|
|
|
UseMI->getOperand(1).setIsKill(false);
|
2019-06-18 20:23:46 +08:00
|
|
|
UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-11-24 05:51:07 +08:00
|
|
|
const MCInstrDesc &UseDesc = UseMI->getDesc();
|
|
|
|
|
|
|
|
// Don't fold into target independent nodes. Target independent opcodes
|
|
|
|
// don't have defined register classes.
|
|
|
|
if (UseDesc.isVariadic() ||
|
2018-02-08 09:12:46 +08:00
|
|
|
UseOp.isImplicit() ||
|
2016-11-24 05:51:07 +08:00
|
|
|
UseDesc.OpInfo[UseOpIdx].RegClass == -1)
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2019-06-24 22:53:56 +08:00
|
|
|
if (!FoldingImmLike) {
|
2016-11-24 05:51:07 +08:00
|
|
|
tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
|
|
|
|
|
|
|
|
// FIXME: We could try to change the instruction from 64-bit to 32-bit
|
|
|
|
// to enable more folding opportunites. The shrink operands pass
|
|
|
|
// already does this.
|
2015-08-29 07:45:19 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2016-11-24 05:51:07 +08:00
|
|
|
|
|
|
|
const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
|
|
|
|
const TargetRegisterClass *FoldRC =
|
2017-01-11 07:32:04 +08:00
|
|
|
TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);
|
2016-11-24 05:51:07 +08:00
|
|
|
|
|
|
|
// Split 64-bit constants into 32-bits for folding.
|
|
|
|
if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
|
Apply llvm-prefer-register-over-unsigned from clang-tidy to LLVM
Summary:
This clang-tidy check is looking for unsigned integer variables whose initializer
starts with an implicit cast from llvm::Register and changes the type of the
variable to llvm::Register (dropping the llvm:: where possible).
Partial reverts in:
X86FrameLowering.cpp - Some functions return unsigned and arguably should be MCRegister
X86FixupLEAs.cpp - Some functions return unsigned and arguably should be MCRegister
X86FrameLowering.cpp - Some functions return unsigned and arguably should be MCRegister
HexagonBitSimplify.cpp - Function takes BitTracker::RegisterRef which appears to be unsigned&
MachineVerifier.cpp - Ambiguous operator==() given MCRegister and const Register
PPCFastISel.cpp - No Register::operator-=()
PeepholeOptimizer.cpp - TargetInstrInfo::optimizeLoadInstr() takes an unsigned&
MachineTraceMetrics.cpp - MachineTraceMetrics lacks a suitable constructor
Manual fixups in:
ARMFastISel.cpp - ARMEmitLoad() now takes a Register& instead of unsigned&
HexagonSplitDouble.cpp - Ternary operator was ambiguous between unsigned/Register
HexagonConstExtenders.cpp - Has a local class named Register, used llvm::Register instead of Register.
PPCFastISel.cpp - PPCEmitLoad() now takes a Register& instead of unsigned&
Depends on D65919
Reviewers: arsenm, bogner, craig.topper, RKSimon
Reviewed By: arsenm
Subscribers: RKSimon, craig.topper, lenary, aemerson, wuzish, jholewinski, MatzeB, qcolombet, dschuff, jyknight, dylanmckay, sdardis, nemanjai, jvesely, wdng, nhaehnle, sbc100, jgravelle-google, kristof.beyls, hiraditya, aheejin, kbarton, fedor.sergeev, javed.absar, asb, rbar, johnrusso, simoncook, apazos, sabuasal, niosHD, jrtc27, MaskRay, zzheng, edward-jones, atanasyan, rogfer01, MartinMosbeck, brucehoult, the_o, tpr, PkmX, jocewei, jsji, Petar.Avramovic, asbirlea, Jim, s.egerton, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65962
llvm-svn: 369041
2019-08-16 03:22:08 +08:00
|
|
|
Register UseReg = UseOp.getReg();
|
2019-06-18 20:23:45 +08:00
|
|
|
const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);
|
2016-11-24 05:51:07 +08:00
|
|
|
|
|
|
|
if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
|
|
|
|
return;
|
|
|
|
|
2017-02-28 06:15:25 +08:00
|
|
|
APInt Imm(64, OpToFold.getImm());
|
2016-11-24 05:51:07 +08:00
|
|
|
if (UseOp.getSubReg() == AMDGPU::sub0) {
|
|
|
|
Imm = Imm.getLoBits(32);
|
|
|
|
} else {
|
|
|
|
assert(UseOp.getSubReg() == AMDGPU::sub1);
|
|
|
|
Imm = Imm.getHiBits(32);
|
|
|
|
}
|
2017-02-28 06:15:25 +08:00
|
|
|
|
|
|
|
MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
|
|
|
|
tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
|
|
|
|
return;
|
2016-11-24 05:51:07 +08:00
|
|
|
}
|
2015-08-29 07:45:19 +08:00
|
|
|
|
2017-02-28 06:15:25 +08:00
|
|
|
|
|
|
|
|
|
|
|
tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
|
2015-08-29 07:45:19 +08:00
|
|
|
}
|
|
|
|
|
2016-09-14 23:19:03 +08:00
|
|
|
static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
|
2017-01-11 07:32:04 +08:00
|
|
|
uint32_t LHS, uint32_t RHS) {
|
2016-09-14 23:19:03 +08:00
|
|
|
switch (Opcode) {
|
|
|
|
case AMDGPU::V_AND_B32_e64:
|
2017-01-11 07:32:04 +08:00
|
|
|
case AMDGPU::V_AND_B32_e32:
|
2016-09-14 23:19:03 +08:00
|
|
|
case AMDGPU::S_AND_B32:
|
|
|
|
Result = LHS & RHS;
|
|
|
|
return true;
|
|
|
|
case AMDGPU::V_OR_B32_e64:
|
2017-01-11 07:32:04 +08:00
|
|
|
case AMDGPU::V_OR_B32_e32:
|
2016-09-14 23:19:03 +08:00
|
|
|
case AMDGPU::S_OR_B32:
|
|
|
|
Result = LHS | RHS;
|
|
|
|
return true;
|
|
|
|
case AMDGPU::V_XOR_B32_e64:
|
2017-01-11 07:32:04 +08:00
|
|
|
case AMDGPU::V_XOR_B32_e32:
|
2016-09-14 23:19:03 +08:00
|
|
|
case AMDGPU::S_XOR_B32:
|
|
|
|
Result = LHS ^ RHS;
|
|
|
|
return true;
|
2020-04-08 02:11:24 +08:00
|
|
|
case AMDGPU::S_XNOR_B32:
|
|
|
|
Result = ~(LHS ^ RHS);
|
|
|
|
return true;
|
|
|
|
case AMDGPU::S_NAND_B32:
|
|
|
|
Result = ~(LHS & RHS);
|
|
|
|
return true;
|
|
|
|
case AMDGPU::S_NOR_B32:
|
|
|
|
Result = ~(LHS | RHS);
|
|
|
|
return true;
|
|
|
|
case AMDGPU::S_ANDN2_B32:
|
|
|
|
Result = LHS & ~RHS;
|
|
|
|
return true;
|
|
|
|
case AMDGPU::S_ORN2_B32:
|
|
|
|
Result = LHS | ~RHS;
|
|
|
|
return true;
|
2017-01-11 07:32:04 +08:00
|
|
|
case AMDGPU::V_LSHL_B32_e64:
|
|
|
|
case AMDGPU::V_LSHL_B32_e32:
|
|
|
|
case AMDGPU::S_LSHL_B32:
|
|
|
|
// The instruction ignores the high bits for out of bounds shifts.
|
|
|
|
Result = LHS << (RHS & 31);
|
|
|
|
return true;
|
|
|
|
case AMDGPU::V_LSHLREV_B32_e64:
|
|
|
|
case AMDGPU::V_LSHLREV_B32_e32:
|
|
|
|
Result = RHS << (LHS & 31);
|
|
|
|
return true;
|
|
|
|
case AMDGPU::V_LSHR_B32_e64:
|
|
|
|
case AMDGPU::V_LSHR_B32_e32:
|
|
|
|
case AMDGPU::S_LSHR_B32:
|
|
|
|
Result = LHS >> (RHS & 31);
|
|
|
|
return true;
|
|
|
|
case AMDGPU::V_LSHRREV_B32_e64:
|
|
|
|
case AMDGPU::V_LSHRREV_B32_e32:
|
|
|
|
Result = RHS >> (LHS & 31);
|
|
|
|
return true;
|
|
|
|
case AMDGPU::V_ASHR_I32_e64:
|
|
|
|
case AMDGPU::V_ASHR_I32_e32:
|
|
|
|
case AMDGPU::S_ASHR_I32:
|
|
|
|
Result = static_cast<int32_t>(LHS) >> (RHS & 31);
|
|
|
|
return true;
|
|
|
|
case AMDGPU::V_ASHRREV_I32_e64:
|
|
|
|
case AMDGPU::V_ASHRREV_I32_e32:
|
|
|
|
Result = static_cast<int32_t>(RHS) >> (LHS & 31);
|
|
|
|
return true;
|
2016-09-14 23:19:03 +08:00
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static unsigned getMovOpc(bool IsScalar) {
|
|
|
|
return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
|
|
|
|
}
|
|
|
|
|
2016-10-07 01:54:30 +08:00
|
|
|
/// Remove any leftover implicit operands from mutating the instruction. e.g.
|
|
|
|
/// if we replace an s_and_b32 with a copy, we don't need the implicit scc def
|
|
|
|
/// anymore.
|
|
|
|
static void stripExtraCopyOperands(MachineInstr &MI) {
|
|
|
|
const MCInstrDesc &Desc = MI.getDesc();
|
|
|
|
unsigned NumOps = Desc.getNumOperands() +
|
|
|
|
Desc.getNumImplicitUses() +
|
|
|
|
Desc.getNumImplicitDefs();
|
|
|
|
|
|
|
|
for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
|
|
|
|
MI.RemoveOperand(I);
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Switch \p MI over to the instruction description \p NewDesc and then drop
/// any trailing operands the new description does not account for.
static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
  // The descriptor must be installed first: the strip step sizes the operand
  // list from MI's current descriptor.
  MI.setDesc(NewDesc);
  stripExtraCopyOperands(MI);
}
|
|
|
|
|
2017-01-11 07:32:04 +08:00
|
|
|
/// Look through a virtual register to the immediate that defines it.
///
/// If \p Op is a full (no subregister) virtual register whose definition is a
/// move-immediate with an immediate in operand 1, return a pointer to that
/// immediate operand of the defining instruction. In every other case return
/// \p Op itself. Note that the returned operand may therefore belong to a
/// different instruction than \p Op.
static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
                                               MachineOperand &Op) {
  if (!Op.isReg())
    return &Op;

  // If this has a subregister, it obviously is a register source.
  if (Op.getSubReg() != AMDGPU::NoSubRegister)
    return &Op;

  // Only virtual registers are looked through.
  if (!Op.getReg().isVirtual())
    return &Op;

  MachineInstr *DefMI = MRI.getVRegDef(Op.getReg());
  if (!DefMI || !DefMI->isMoveImmediate())
    return &Op;

  MachineOperand &MovSrc = DefMI->getOperand(1);
  return MovSrc.isImm() ? &MovSrc : &Op;
}
|
|
|
|
|
2016-09-14 23:19:03 +08:00
|
|
|
// Try to simplify operations with a constant that may appear after instruction
|
|
|
|
// selection.
|
2017-01-11 07:32:04 +08:00
|
|
|
// TODO: See if a frame index with a fixed offset can fold.
|
2016-09-14 23:19:03 +08:00
|
|
|
static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
|
|
|
|
const SIInstrInfo *TII,
|
2017-01-11 07:32:04 +08:00
|
|
|
MachineInstr *MI,
|
|
|
|
MachineOperand *ImmOp) {
|
2016-09-14 23:19:03 +08:00
|
|
|
unsigned Opc = MI->getOpcode();
|
|
|
|
if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
|
|
|
|
Opc == AMDGPU::S_NOT_B32) {
|
2017-01-11 07:32:04 +08:00
|
|
|
MI->getOperand(1).ChangeToImmediate(~ImmOp->getImm());
|
|
|
|
mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
|
|
|
|
return true;
|
2016-09-14 23:19:03 +08:00
|
|
|
}
|
|
|
|
|
2017-01-11 07:32:04 +08:00
|
|
|
int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
|
|
|
|
if (Src1Idx == -1)
|
2016-09-14 23:19:03 +08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
|
2017-01-11 07:32:04 +08:00
|
|
|
MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx));
|
|
|
|
MachineOperand *Src1 = getImmOrMaterializedImm(MRI, MI->getOperand(Src1Idx));
|
2016-09-14 23:19:03 +08:00
|
|
|
|
|
|
|
if (!Src0->isImm() && !Src1->isImm())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// and k0, k1 -> v_mov_b32 (k0 & k1)
|
|
|
|
// or k0, k1 -> v_mov_b32 (k0 | k1)
|
|
|
|
// xor k0, k1 -> v_mov_b32 (k0 ^ k1)
|
|
|
|
if (Src0->isImm() && Src1->isImm()) {
|
|
|
|
int32_t NewImm;
|
|
|
|
if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
const SIRegisterInfo &TRI = TII->getRegisterInfo();
|
|
|
|
bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg());
|
|
|
|
|
2017-01-11 07:32:04 +08:00
|
|
|
// Be careful to change the right operand, src0 may belong to a different
|
|
|
|
// instruction.
|
|
|
|
MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
|
2016-09-14 23:19:03 +08:00
|
|
|
MI->RemoveOperand(Src1Idx);
|
2016-10-07 01:54:30 +08:00
|
|
|
mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
|
2016-09-14 23:19:03 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2017-01-11 07:32:04 +08:00
|
|
|
if (!MI->isCommutable())
|
|
|
|
return false;
|
|
|
|
|
2016-09-14 23:19:03 +08:00
|
|
|
if (Src0->isImm() && !Src1->isImm()) {
|
|
|
|
std::swap(Src0, Src1);
|
|
|
|
std::swap(Src0Idx, Src1Idx);
|
|
|
|
}
|
|
|
|
|
|
|
|
int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
|
2017-01-11 07:32:04 +08:00
|
|
|
if (Opc == AMDGPU::V_OR_B32_e64 ||
|
|
|
|
Opc == AMDGPU::V_OR_B32_e32 ||
|
|
|
|
Opc == AMDGPU::S_OR_B32) {
|
2016-09-14 23:19:03 +08:00
|
|
|
if (Src1Val == 0) {
|
|
|
|
// y = or x, 0 => y = copy x
|
|
|
|
MI->RemoveOperand(Src1Idx);
|
2016-10-07 01:54:30 +08:00
|
|
|
mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
|
2016-09-14 23:19:03 +08:00
|
|
|
} else if (Src1Val == -1) {
|
|
|
|
// y = or x, -1 => y = v_mov_b32 -1
|
|
|
|
MI->RemoveOperand(Src1Idx);
|
2016-10-07 01:54:30 +08:00
|
|
|
mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
|
2016-09-14 23:19:03 +08:00
|
|
|
} else
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 ||
|
2017-01-11 07:32:04 +08:00
|
|
|
MI->getOpcode() == AMDGPU::V_AND_B32_e32 ||
|
2016-09-14 23:19:03 +08:00
|
|
|
MI->getOpcode() == AMDGPU::S_AND_B32) {
|
|
|
|
if (Src1Val == 0) {
|
|
|
|
// y = and x, 0 => y = v_mov_b32 0
|
|
|
|
MI->RemoveOperand(Src0Idx);
|
2016-10-07 01:54:30 +08:00
|
|
|
mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
|
2016-09-14 23:19:03 +08:00
|
|
|
} else if (Src1Val == -1) {
|
|
|
|
// y = and x, -1 => y = copy x
|
|
|
|
MI->RemoveOperand(Src1Idx);
|
2016-10-07 01:54:30 +08:00
|
|
|
mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
|
|
|
|
stripExtraCopyOperands(*MI);
|
2016-09-14 23:19:03 +08:00
|
|
|
} else
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 ||
|
2017-01-11 07:32:04 +08:00
|
|
|
MI->getOpcode() == AMDGPU::V_XOR_B32_e32 ||
|
2016-09-14 23:19:03 +08:00
|
|
|
MI->getOpcode() == AMDGPU::S_XOR_B32) {
|
|
|
|
if (Src1Val == 0) {
|
|
|
|
// y = xor x, 0 => y = copy x
|
|
|
|
MI->RemoveOperand(Src1Idx);
|
2016-10-07 01:54:30 +08:00
|
|
|
mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
|
2017-01-11 07:32:04 +08:00
|
|
|
return true;
|
2016-09-14 23:19:03 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2017-03-25 02:55:20 +08:00
|
|
|
// Try to fold an instruction into a simpler one
|
|
|
|
static bool tryFoldInst(const SIInstrInfo *TII,
|
|
|
|
MachineInstr *MI) {
|
|
|
|
unsigned Opc = MI->getOpcode();
|
|
|
|
|
|
|
|
if (Opc == AMDGPU::V_CNDMASK_B32_e32 ||
|
|
|
|
Opc == AMDGPU::V_CNDMASK_B32_e64 ||
|
|
|
|
Opc == AMDGPU::V_CNDMASK_B64_PSEUDO) {
|
|
|
|
const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
|
|
|
|
const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
|
2019-03-19 03:25:39 +08:00
|
|
|
int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
|
|
|
|
int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
|
|
|
|
if (Src1->isIdenticalTo(*Src0) &&
|
|
|
|
(Src1ModIdx == -1 || !MI->getOperand(Src1ModIdx).getImm()) &&
|
|
|
|
(Src0ModIdx == -1 || !MI->getOperand(Src0ModIdx).getImm())) {
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "Folded " << *MI << " into ");
|
2019-03-19 03:25:39 +08:00
|
|
|
auto &NewDesc =
|
|
|
|
TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
|
2017-03-25 02:55:20 +08:00
|
|
|
int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
|
|
|
|
if (Src2Idx != -1)
|
|
|
|
MI->RemoveOperand(Src2Idx);
|
|
|
|
MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
|
2019-03-19 03:25:39 +08:00
|
|
|
if (Src1ModIdx != -1)
|
|
|
|
MI->RemoveOperand(Src1ModIdx);
|
|
|
|
if (Src0ModIdx != -1)
|
|
|
|
MI->RemoveOperand(Src0ModIdx);
|
|
|
|
mutateCopyOp(*MI, NewDesc);
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << *MI << '\n');
|
2017-03-25 02:55:20 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2017-01-11 07:32:04 +08:00
|
|
|
/// Fold \p OpToFold (the source of \p MI) into the non-debug uses of the
/// register defined by operand 0 of \p MI.
///
/// The work happens in two phases: first every use is examined and fold
/// candidates are collected in a list, then the collected candidates are
/// applied to their use instructions.
void SIFoldOperands::foldInstOperand(MachineInstr &MI,
                                     MachineOperand &OpToFold) const {
  using UseIter = MachineRegisterInfo::use_nodbg_iterator;

  // We need to mutate the operands of new mov instructions to add implicit
  // uses of EXEC, but adding them invalidates the use_iterator, so defer
  // this.
  SmallVector<MachineInstr *, 4> CopiesToReplace;
  SmallVector<FoldCandidate, 4> FoldList;
  MachineOperand &Dst = MI.getOperand(0);

  const bool FoldingImm =
      OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();

  if (!FoldingImm) {
    // Folding register. Snapshot the use list first, then process the
    // snapshot.
    SmallVector<UseIter, 4> UsesToProcess;
    for (UseIter Use = MRI->use_nodbg_begin(Dst.getReg()),
                 E = MRI->use_nodbg_end();
         Use != E; ++Use)
      UsesToProcess.push_back(Use);

    for (UseIter U : UsesToProcess) {
      MachineInstr *UseMI = U->getParent();
      foldOperand(OpToFold, UseMI, U.getOperandNo(), FoldList,
                  CopiesToReplace);
    }
  } else {
    unsigned NumLiteralUses = 0;
    MachineOperand *NonInlineUse = nullptr;
    int NonInlineUseOpNo = -1;

    UseIter NextUse;
    for (UseIter Use = MRI->use_nodbg_begin(Dst.getReg()),
                 E = MRI->use_nodbg_end();
         Use != E; Use = NextUse) {
      NextUse = std::next(Use);
      MachineInstr *UseMI = Use->getParent();
      const unsigned OpNo = Use.getOperandNo();

      // Folding the immediate may reveal operations that can be constant
      // folded or replaced with a copy. This can happen for example after
      // frame indices are lowered to constants or from splitting 64-bit
      // constants.
      //
      // We may also encounter cases where one or both operands are
      // immediates materialized into a register, which would ordinarily not
      // be folded due to multiple uses or operand constraints.
      if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) {
        LLVM_DEBUG(dbgs() << "Constant folded " << *UseMI << '\n');

        // Some constant folding cases change the same immediate's use to a new
        // instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user
        // again. The same constant folded instruction could also have a second
        // use operand.
        //
        // Because the whole use list is re-scanned, the candidates gathered so
        // far are discarded and collected again.
        // NOTE(review): NumLiteralUses is not reset here, so literal uses seen
        // before the restart are counted again -- confirm this is intended.
        NextUse = MRI->use_nodbg_begin(Dst.getReg());
        FoldList.clear();
        continue;
      }

      // Try to fold any inline immediate uses, and then only fold other
      // constants if they have one use.
      //
      // The legality of the inline immediate must be checked based on the use
      // operand, not the defining instruction, because 32-bit instructions
      // with 32-bit inline immediate sources may be used to materialize
      // constants used in 16-bit operands.
      //
      // e.g. it is unsafe to fold:
      //  s_mov_b32 s0, 1.0    // materializes 0x3f800000
      //  v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00

      // Folding immediates with more than one use will increase program size.
      // FIXME: This will also reduce register usage, which may be better
      // in some cases. A better heuristic is needed.
      if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold) ||
          frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) {
        foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
        continue;
      }

      // Remember only the first use that is neither of the above; it is
      // folded after the loop if it turns out to be the sole one.
      ++NumLiteralUses;
      if (NumLiteralUses == 1) {
        NonInlineUse = &*Use;
        NonInlineUseOpNo = OpNo;
      }
    }

    if (NumLiteralUses == 1) {
      MachineInstr *UseMI = NonInlineUse->getParent();
      foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace);
    }
  }

  MachineFunction *MF = MI.getParent()->getParent();
  // Make sure we add EXEC uses to any new v_mov instructions created.
  for (MachineInstr *Copy : CopiesToReplace)
    Copy->addImplicitDefUseOperands(*MF);

  for (FoldCandidate &Fold : FoldList) {
    assert(!Fold.isReg() || Fold.OpToFold);
    if (Fold.isReg() && Fold.OpToFold->getReg().isVirtual()) {
      Register Reg = Fold.OpToFold->getReg();
      MachineInstr *DefMI = Fold.OpToFold->getParent();
      // Skip the candidate when the defining instruction reads EXEC and EXEC
      // may be modified before the use.
      if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
          execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
        continue;
    }

    if (!updateOperand(Fold, *TII, *TRI, *ST)) {
      // Restoring instruction's original operand order if fold has failed.
      if (Fold.isCommuted())
        TII->commuteInstruction(*Fold.UseMI, false);
      continue;
    }

    // Clear kill flags.
    if (Fold.isReg()) {
      assert(Fold.OpToFold && Fold.OpToFold->isReg());
      // FIXME: Probably shouldn't bother trying to fold if not an
      // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
      // copies.
      MRI->clearKillFlags(Fold.OpToFold->getReg());
    }
    LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
                      << static_cast<int>(Fold.UseOpNo) << " of "
                      << *Fold.UseMI << '\n');
    tryFoldInst(TII, Fold.UseMI);
  }
}
|
|
|
|
|
2017-10-05 08:13:20 +08:00
|
|
|
// Clamp patterns are canonically selected to v_max_* instructions, so only
|
|
|
|
// handle them.
|
2017-02-23 07:27:53 +08:00
|
|
|
const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
|
|
|
|
unsigned Op = MI.getOpcode();
|
|
|
|
switch (Op) {
|
|
|
|
case AMDGPU::V_MAX_F32_e64:
|
2017-02-23 07:53:37 +08:00
|
|
|
case AMDGPU::V_MAX_F16_e64:
|
2017-09-01 07:53:50 +08:00
|
|
|
case AMDGPU::V_MAX_F64:
|
|
|
|
case AMDGPU::V_PK_MAX_F16: {
|
2017-02-23 07:27:53 +08:00
|
|
|
if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
|
|
|
|
return nullptr;
|
|
|
|
|
|
|
|
// Make sure sources are identical.
|
|
|
|
const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
|
|
|
|
const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
|
2017-06-05 09:03:04 +08:00
|
|
|
if (!Src0->isReg() || !Src1->isReg() ||
|
2017-10-05 08:13:17 +08:00
|
|
|
Src0->getReg() != Src1->getReg() ||
|
2017-06-05 09:03:04 +08:00
|
|
|
Src0->getSubReg() != Src1->getSubReg() ||
|
2017-02-23 07:27:53 +08:00
|
|
|
Src0->getSubReg() != AMDGPU::NoSubRegister)
|
|
|
|
return nullptr;
|
|
|
|
|
|
|
|
// Can't fold up if we have modifiers.
|
2017-09-01 07:53:50 +08:00
|
|
|
if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
|
|
|
|
return nullptr;
|
|
|
|
|
|
|
|
unsigned Src0Mods
|
|
|
|
= TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
|
|
|
|
unsigned Src1Mods
|
|
|
|
= TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
|
|
|
|
|
|
|
|
// Having a 0 op_sel_hi would require swizzling the output in the source
|
|
|
|
// instruction, which we can't do.
|
2019-03-14 05:15:52 +08:00
|
|
|
unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1
|
|
|
|
: 0u;
|
2017-09-01 07:53:50 +08:00
|
|
|
if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
|
2017-02-23 07:27:53 +08:00
|
|
|
return nullptr;
|
|
|
|
return Src0;
|
|
|
|
}
|
|
|
|
default:
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// We obviously have multiple uses in a clamp since the register is used twice
|
|
|
|
// in the same instruction.
|
|
|
|
static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) {
|
|
|
|
int Count = 0;
|
|
|
|
for (auto I = MRI.use_instr_nodbg_begin(Reg), E = MRI.use_instr_nodbg_end();
|
|
|
|
I != E; ++I) {
|
|
|
|
if (++Count > 1)
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2017-09-21 05:01:24 +08:00
|
|
|
// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
|
2017-02-23 07:27:53 +08:00
|
|
|
bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
|
|
|
|
const MachineOperand *ClampSrc = isClamp(MI);
|
|
|
|
if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg()))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
|
2017-09-01 07:53:50 +08:00
|
|
|
|
|
|
|
// The type of clamp must be compatible.
|
|
|
|
if (TII->getClampMask(*Def) != TII->getClampMask(MI))
|
2017-02-23 07:27:53 +08:00
|
|
|
return false;
|
2017-09-01 07:53:50 +08:00
|
|
|
|
2017-02-23 07:27:53 +08:00
|
|
|
MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
|
|
|
|
if (!DefClamp)
|
|
|
|
return false;
|
|
|
|
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def
|
|
|
|
<< '\n');
|
2017-02-23 07:27:53 +08:00
|
|
|
|
|
|
|
// Clamp is applied after omod, so it is OK if omod is set.
|
|
|
|
DefClamp->setImm(1);
|
|
|
|
MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
|
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2017-02-28 03:35:42 +08:00
|
|
|
static int getOModValue(unsigned Opc, int64_t Val) {
|
|
|
|
switch (Opc) {
|
|
|
|
case AMDGPU::V_MUL_F32_e64: {
|
|
|
|
switch (static_cast<uint32_t>(Val)) {
|
|
|
|
case 0x3f000000: // 0.5
|
|
|
|
return SIOutMods::DIV2;
|
|
|
|
case 0x40000000: // 2.0
|
|
|
|
return SIOutMods::MUL2;
|
|
|
|
case 0x40800000: // 4.0
|
|
|
|
return SIOutMods::MUL4;
|
|
|
|
default:
|
|
|
|
return SIOutMods::NONE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
case AMDGPU::V_MUL_F16_e64: {
|
|
|
|
switch (static_cast<uint16_t>(Val)) {
|
|
|
|
case 0x3800: // 0.5
|
|
|
|
return SIOutMods::DIV2;
|
|
|
|
case 0x4000: // 2.0
|
|
|
|
return SIOutMods::MUL2;
|
|
|
|
case 0x4400: // 4.0
|
|
|
|
return SIOutMods::MUL4;
|
|
|
|
default:
|
|
|
|
return SIOutMods::NONE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
default:
|
|
|
|
llvm_unreachable("invalid mul opcode");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// FIXME: Does this really not support denormals with f16?
|
|
|
|
// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
|
|
|
|
// handled, so will anything other than that break?
|
|
|
|
std::pair<const MachineOperand *, int>
|
|
|
|
SIFoldOperands::isOMod(const MachineInstr &MI) const {
|
|
|
|
unsigned Op = MI.getOpcode();
|
|
|
|
switch (Op) {
|
|
|
|
case AMDGPU::V_MUL_F32_e64:
|
|
|
|
case AMDGPU::V_MUL_F16_e64: {
|
|
|
|
// If output denormals are enabled, omod is ignored.
|
2019-12-03 15:01:21 +08:00
|
|
|
if ((Op == AMDGPU::V_MUL_F32_e64 && MFI->getMode().FP32OutputDenormals) ||
|
|
|
|
(Op == AMDGPU::V_MUL_F16_e64 && MFI->getMode().FP64FP16OutputDenormals))
|
2017-02-28 03:35:42 +08:00
|
|
|
return std::make_pair(nullptr, SIOutMods::NONE);
|
|
|
|
|
|
|
|
const MachineOperand *RegOp = nullptr;
|
|
|
|
const MachineOperand *ImmOp = nullptr;
|
|
|
|
const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
|
|
|
|
const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
|
|
|
|
if (Src0->isImm()) {
|
|
|
|
ImmOp = Src0;
|
|
|
|
RegOp = Src1;
|
|
|
|
} else if (Src1->isImm()) {
|
|
|
|
ImmOp = Src1;
|
|
|
|
RegOp = Src0;
|
|
|
|
} else
|
|
|
|
return std::make_pair(nullptr, SIOutMods::NONE);
|
|
|
|
|
|
|
|
int OMod = getOModValue(Op, ImmOp->getImm());
|
|
|
|
if (OMod == SIOutMods::NONE ||
|
|
|
|
TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
|
|
|
|
TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
|
|
|
|
TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
|
|
|
|
TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
|
|
|
|
return std::make_pair(nullptr, SIOutMods::NONE);
|
|
|
|
|
|
|
|
return std::make_pair(RegOp, OMod);
|
|
|
|
}
|
|
|
|
case AMDGPU::V_ADD_F32_e64:
|
|
|
|
case AMDGPU::V_ADD_F16_e64: {
|
|
|
|
// If output denormals are enabled, omod is ignored.
|
2019-12-03 15:01:21 +08:00
|
|
|
if ((Op == AMDGPU::V_ADD_F32_e64 && MFI->getMode().FP32OutputDenormals) ||
|
|
|
|
(Op == AMDGPU::V_ADD_F16_e64 && MFI->getMode().FP64FP16OutputDenormals))
|
2017-02-28 03:35:42 +08:00
|
|
|
return std::make_pair(nullptr, SIOutMods::NONE);
|
|
|
|
|
|
|
|
// Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
|
|
|
|
const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
|
|
|
|
const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
|
|
|
|
|
|
|
|
if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
|
|
|
|
Src0->getSubReg() == Src1->getSubReg() &&
|
|
|
|
!TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
|
|
|
|
!TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
|
|
|
|
!TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
|
|
|
|
!TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
|
|
|
|
return std::make_pair(Src0, SIOutMods::MUL2);
|
|
|
|
|
|
|
|
return std::make_pair(nullptr, SIOutMods::NONE);
|
|
|
|
}
|
|
|
|
default:
|
|
|
|
return std::make_pair(nullptr, SIOutMods::NONE);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// FIXME: Does this need to check IEEE bit on function?
|
|
|
|
bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
|
|
|
|
const MachineOperand *RegOp;
|
|
|
|
int OMod;
|
|
|
|
std::tie(RegOp, OMod) = isOMod(MI);
|
|
|
|
if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
|
|
|
|
RegOp->getSubReg() != AMDGPU::NoSubRegister ||
|
|
|
|
!hasOneNonDBGUseInst(*MRI, RegOp->getReg()))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
|
|
|
|
MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
|
|
|
|
if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// Clamp is applied after omod. If the source already has clamp set, don't
|
|
|
|
// fold it.
|
|
|
|
if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
|
|
|
|
return false;
|
|
|
|
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n');
|
2017-02-28 03:35:42 +08:00
|
|
|
|
|
|
|
DefOMod->setImm(OMod);
|
|
|
|
MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
|
|
|
|
MI.eraseFromParent();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2014-11-22 06:06:37 +08:00
|
|
|
/// Pass entry point. Walks the blocks of \p MF in depth-first order and, per
/// instruction, tries the simplifications implemented in this file: foldable
/// copies have their source folded into the users of their result, other
/// instructions are tried as omod / clamp folds, and repeated identical
/// definitions of m0 inside a block are erased.
///
/// \returns false if the function is skipped, true otherwise.
bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MFI = MF.getInfo<SIMachineFunctionInfo>();

  // omod is ignored by hardware if IEEE bit is enabled. omod also does not
  // correctly handle signed zeros.
  //
  // FIXME: Also need to check strictfp
  const bool IsIEEEMode = MFI->getMode().IEEE;
  const bool HasNSZ = MFI->hasNoSignedZerosFPMath();

  for (MachineBasicBlock *MBB : depth_first(&MF)) {
    // Source operand of the most recent foldable copy into m0 in this block,
    // or null while the value of m0 is unknown.
    MachineOperand *CurrentKnownM0Val = nullptr;

    MachineBasicBlock::iterator Next;
    for (MachineBasicBlock::iterator I = MBB->begin(); I != MBB->end();
         I = Next) {
      // Grab the successor up front: the current instruction may be erased.
      Next = std::next(I);
      MachineInstr &MI = *I;

      tryFoldInst(TII, &MI);

      if (!TII->isFoldableCopy(MI)) {
        // Saw an unknown clobber of m0, so we no longer know what it is.
        if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
          CurrentKnownM0Val = nullptr;

        // TODO: Omod might be OK if there is NSZ only on the source
        // instruction, and not the omod multiply.
        const bool OModAllowed =
            !IsIEEEMode && (HasNSZ || MI.getFlag(MachineInstr::FmNsz));
        // Clamp folding is the fallback whenever omod folding is not allowed
        // or did not apply.
        if (!OModAllowed || !tryFoldOMod(MI))
          tryFoldClamp(MI);

        continue;
      }

      // Specially track simple redefs of m0 to the same value in a block, so we
      // can erase the later ones.
      if (MI.getOperand(0).getReg() == AMDGPU::M0) {
        MachineOperand &NewM0Val = MI.getOperand(1);
        if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
          MI.eraseFromParent();
          continue;
        }

        // We aren't tracking other physical registers
        if (NewM0Val.isReg() && NewM0Val.getReg().isPhysical())
          CurrentKnownM0Val = nullptr;
        else
          CurrentKnownM0Val = &NewM0Val;
        continue;
      }

      MachineOperand &OpToFold = MI.getOperand(1);

      // FIXME: We could also be folding things like TargetIndexes.
      if (OpToFold.isReg()) {
        // Register sources are only folded when they are virtual.
        if (!OpToFold.getReg().isVirtual())
          continue;
      } else {
        const bool FoldingImm =
            OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
        if (!FoldingImm)
          continue;
      }

      // Prevent folding operands backwards in the function. For example,
      // the COPY opcode must not be replaced by 1 in this example:
      //
      //    %3 = COPY %vgpr0; VGPR_32:%3
      //    ...
      //    %vgpr0 = V_MOV_B32_e32 1, implicit %exec
      MachineOperand &Dst = MI.getOperand(0);
      if (Dst.isReg() && !Dst.getReg().isVirtual())
        continue;

      foldInstOperand(MI, OpToFold);
    }
  }
  return true;
}
|