Revert "[AMDGPU] SIFixSGPRCopies refactoring"

Breaks ASan tests.

This reverts commit 3f8ae7efa8.
Evgenii Stepanov 2022-08-10 11:31:23 -07:00
parent 23ace05e0a
commit 8ea1cf3111
3 changed files with 276 additions and 236 deletions


@@ -120,10 +120,6 @@ public:
class SIFixSGPRCopies : public MachineFunctionPass {
MachineDominatorTree *MDT;
SmallVector<MachineInstr*, 4> SCCCopies;
SmallVector<MachineInstr*, 4> RegSequences;
SmallVector<MachineInstr*, 4> PHINodes;
SmallVector<MachineInstr*, 4> S2VCopies;
unsigned NextVGPRToSGPRCopyID;
DenseMap<unsigned, V2SCopyInfo> V2SCopies;
DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenalty;
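// SiblingPenalty maps each SALU instruction to the IDs of the V2S
// copies whose scalar chains contain it; the scoring below uses it to
// penalize copies that feed the same SALU users.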
@@ -138,11 +134,8 @@ public:
SIFixSGPRCopies() : MachineFunctionPass(ID), NextVGPRToSGPRCopyID(0) {}
bool runOnMachineFunction(MachineFunction &MF) override;
void fixSCCCopies(bool IsWave32);
void prepareRegSequenceAndPHIs(MachineFunction &MF);
unsigned getNextVGPRToSGPRCopyId() { return ++NextVGPRToSGPRCopyID; }
bool needToBeConvertedToVALU(V2SCopyInfo *I);
void analyzeVGPRToSGPRCopy(MachineInstr *MI);
void analyzeVGPRToSGPRCopy(V2SCopyInfo& Info);
void lowerVGPR2SGPRCopies(MachineFunction &MF);
// Handles copies whose source register is:
// 1. Physical register
@@ -178,6 +171,19 @@ FunctionPass *llvm::createSIFixSGPRCopiesPass() {
return new SIFixSGPRCopies();
}
static bool hasVectorOperands(const MachineInstr &MI,
const SIRegisterInfo *TRI) {
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg() || !MO.getReg().isVirtual())
continue;
if (TRI->hasVectorRegisters(MRI.getRegClass(MO.getReg())))
return true;
}
return false;
}
static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
getCopyRegClasses(const MachineInstr &Copy,
const SIRegisterInfo &TRI,
@@ -610,6 +616,14 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
TII = ST.getInstrInfo();
MDT = &getAnalysis<MachineDominatorTree>();
// We have to lower VGPR to SGPR copies before the main loop
// because the REG_SEQUENCE and PHI lowering in the main loop
// converts the def-use chains to VALU and closes the opportunities
// for keeping them scalar.
// TODO: REG_SEQUENCE and PHIs are semantically copies. The next patch
// addresses their lowering and unifies the processing in one main loop.
lowerVGPR2SGPRCopies(MF);
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; ++BI) {
MachineBasicBlock *MBB = &*BI;
@@ -625,66 +639,100 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
case AMDGPU::STRICT_WQM:
case AMDGPU::SOFT_WQM:
case AMDGPU::STRICT_WWM: {
Register SrcReg = MI.getOperand(1).getReg();
Register DstReg = MI.getOperand(0).getReg();
const TargetRegisterClass *SrcRC, *DstRC;
std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);
if (MI.isCopy() && (SrcReg == AMDGPU::SCC || DstReg == AMDGPU::SCC))
SCCCopies.push_back(&MI);
if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
// Since VGPR to SGPR copies affect the VGPR to SGPR copy
// score and, hence, the lowering decision, let's try to get rid of
// them as early as possible.
if (tryChangeVGPRtoSGPRinCopy(MI, TRI, TII))
if (MI.isCopy()) {
Register SrcReg = MI.getOperand(1).getReg();
if (SrcReg == AMDGPU::SCC) {
Register SCCCopy = MRI->createVirtualRegister(
TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID));
I = BuildMI(*MI.getParent(),
std::next(MachineBasicBlock::iterator(MI)),
MI.getDebugLoc(),
TII->get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
: AMDGPU::S_CSELECT_B64),
SCCCopy)
.addImm(-1)
.addImm(0);
I = BuildMI(*MI.getParent(), std::next(I), I->getDebugLoc(),
TII->get(AMDGPU::COPY), DstReg)
.addReg(SCCCopy);
MI.eraseFromParent();
continue;
} else if (DstReg == AMDGPU::SCC) {
unsigned Opcode =
ST.isWave64() ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
Register Exec = ST.isWave64() ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
Register Tmp = MRI->createVirtualRegister(TRI->getBoolRC());
I = BuildMI(*MI.getParent(),
std::next(MachineBasicBlock::iterator(MI)),
MI.getDebugLoc(), TII->get(Opcode))
.addReg(Tmp, getDefRegState(true))
.addReg(SrcReg)
.addReg(Exec);
MI.eraseFromParent();
continue;
// Collect the copies that were not changed so they can be retried
// after VGPR to SGPR copy lowering, when there will be more
// opportunities.
S2VCopies.push_back(&MI);
}
if (!isVGPRToSGPRCopy(SrcRC, DstRC, *TRI))
continue;
if (lowerSpecialCase(MI))
continue;
analyzeVGPRToSGPRCopy(&MI);
break;
}
case AMDGPU::INSERT_SUBREG:
case AMDGPU::PHI:
case AMDGPU::REG_SEQUENCE: {
if (TRI->isSGPRClass(TII->getOpRegClass(MI, 0))) {
for (MachineOperand &MO : MI.operands()) {
if (!MO.isReg() || !MO.getReg().isVirtual())
continue;
const TargetRegisterClass *SrcRC = MRI->getRegClass(MO.getReg());
if (TRI->hasVectorRegisters(SrcRC)) {
const TargetRegisterClass *DestRC =
TRI->getEquivalentSGPRClass(SrcRC);
Register NewDst = MRI->createVirtualRegister(DestRC);
MachineBasicBlock *BlockToInsertCopy =
MI.isPHI() ? MI.getOperand(MI.getOperandNo(&MO) + 1).getMBB()
: MBB;
MachineBasicBlock::iterator PointToInsertCopy =
MI.isPHI() ? BlockToInsertCopy->getFirstInstrTerminator() : I;
MachineInstr *NewCopy =
BuildMI(*BlockToInsertCopy, PointToInsertCopy,
PointToInsertCopy->getDebugLoc(),
TII->get(AMDGPU::COPY), NewDst)
.addReg(MO.getReg());
MO.setReg(NewDst);
analyzeVGPRToSGPRCopy(NewCopy);
}
}
}
if (MI.isPHI())
PHINodes.push_back(&MI);
else if (MI.isRegSequence())
RegSequences.push_back(&MI);
if (!DstReg.isVirtual()) {
// If the destination register is a physical register there isn't
// really much we can do to fix this.
// Some special instructions use M0 as an input. Some even only use
// the first lane. Insert a readfirstlane and hope for the best.
if (DstReg == AMDGPU::M0 && TRI->hasVectorRegisters(SrcRC)) {
Register TmpReg
= MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(*MBB, MI, MI.getDebugLoc(),
TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg)
.add(MI.getOperand(1));
MI.getOperand(1).setReg(TmpReg);
}
continue;
}
if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
tryChangeVGPRtoSGPRinCopy(MI, TRI, TII);
}
break;
}
case AMDGPU::PHI: {
processPHINode(MI);
break;
}
case AMDGPU::REG_SEQUENCE: {
if (TRI->hasVectorRegisters(TII->getOpRegClass(MI, 0)) ||
!hasVectorOperands(MI, TRI)) {
foldVGPRCopyIntoRegSequence(MI, TRI, TII, *MRI);
continue;
}
break;
}
case AMDGPU::INSERT_SUBREG: {
const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
DstRC = MRI->getRegClass(MI.getOperand(0).getReg());
Src0RC = MRI->getRegClass(MI.getOperand(1).getReg());
Src1RC = MRI->getRegClass(MI.getOperand(2).getReg());
if (TRI->isSGPRClass(DstRC) &&
(TRI->hasVectorRegisters(Src0RC) ||
TRI->hasVectorRegisters(Src1RC))) {
LLVM_DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
if (NewBB && NewBB != MBB) {
MBB = NewBB;
E = MBB->end();
BI = MachineFunction::iterator(MBB);
BE = MF.end();
}
assert((!NewBB || NewBB == I->getParent()) &&
"moveToVALU did not return the right basic block");
}
break;
}
case AMDGPU::V_WRITELANE_B32: {
@@ -752,41 +800,11 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
}
}
lowerVGPR2SGPRCopies(MF);
// Postprocessing
fixSCCCopies(ST.isWave32());
for (auto MI : S2VCopies) {
// Check if it is still valid
if (MI->getParent() && MI->isCopy()) {
const TargetRegisterClass *SrcRC, *DstRC;
std::tie(SrcRC, DstRC) = getCopyRegClasses(*MI, *TRI, *MRI);
if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
tryChangeVGPRtoSGPRinCopy(*MI, TRI, TII);
}
}
for (auto MI : RegSequences) {
// Check if it is still valid
if (MI->getParent() && MI->isRegSequence())
foldVGPRCopyIntoRegSequence(*MI, TRI, TII, *MRI);
}
for (auto MI : PHINodes) {
processPHINode(*MI);
}
if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge)
hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII);
SiblingPenalty.clear();
V2SCopies.clear();
SCCCopies.clear();
RegSequences.clear();
PHINodes.clear();
S2VCopies.clear();
return true;
}
@@ -843,29 +861,7 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
}
bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI) {
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
if (!DstReg.isVirtual()) {
// If the destination register is a physical register there isn't
// really much we can do to fix this.
// Some special instructions use M0 as an input. Some even only use
// the first lane. Insert a readfirstlane and hope for the best.
if (DstReg == AMDGPU::M0 &&
TRI->hasVectorRegisters(MRI->getRegClass(SrcReg))) {
Register TmpReg =
MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg)
.add(MI.getOperand(1));
MI.getOperand(1).setReg(TmpReg);
}
return true;
}
if (!SrcReg.isVirtual() || TRI->isAGPR(*MRI, SrcReg)) {
TII->moveToVALU(MI, MDT);
return true;
@@ -884,13 +880,9 @@ bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI) {
return false;
}
void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) {
Register DstReg = MI->getOperand(0).getReg();
const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);
V2SCopyInfo Info(getNextVGPRToSGPRCopyId(), MI,
TRI->getRegSizeInBits(*DstRC));
void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(V2SCopyInfo& Info) {
SmallVector<MachineInstr *, 8> AnalysisWorklist;
// Needed because the SSA is not a tree but a graph and may have
// forks and joins. We should not then go the same way twice.
@@ -938,52 +930,143 @@ void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) {
AnalysisWorklist.push_back(U);
}
}
V2SCopies[Info.ID] = Info;
}
// The main function that computes the VGPR to SGPR copy score
// and decides how the copy is further lowered: v_readfirstlane_b32 or moveToVALU.
bool SIFixSGPRCopies::needToBeConvertedToVALU(V2SCopyInfo *Info) {
if (Info->SChain.empty()) {
Info->Score = 0;
return true;
}
Info->Siblings = SiblingPenalty[*std::max_element(
Info->SChain.begin(), Info->SChain.end(),
[&](MachineInstr *A, MachineInstr *B) -> bool {
return SiblingPenalty[A].size() < SiblingPenalty[B].size();
})];
Info->Siblings.remove_if([&](unsigned ID) { return ID == Info->ID; });
// The loop below computes the number of other VGPR to SGPR copies
// that contribute to the current copy's SALU chain. We assume that
// all the V2SCopies with the same source virtual register will be
// squashed into one by regalloc. We also take care of the V2SCopies
// of different subregs of the same register.
SmallSet<std::pair<Register, unsigned>, 4> SrcRegs;
for (auto J : Info->Siblings) {
auto InfoIt = V2SCopies.find(J);
if (InfoIt != V2SCopies.end()) {
MachineInstr *SiblingCopy = InfoIt->getSecond().Copy;
if (SiblingCopy->isImplicitDef())
// the COPY has already been MoveToVALUed
continue;
SrcRegs.insert(std::make_pair(SiblingCopy->getOperand(1).getReg(),
SiblingCopy->getOperand(1).getSubReg()));
}
}
Info->SiblingPenalty = SrcRegs.size();
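// Illustrative example: a chain of 5 SALU users (Profit = 5) with one
// S-to-V copy and one readfirstlane on the way (Penalty = 2) scores 3
// and stays scalar; any copy scoring below 3 goes to moveToVALU.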
unsigned Penalty =
Info->NumSVCopies + Info->SiblingPenalty + Info->NumReadfirstlanes;
unsigned Profit = Info->SChain.size();
Info->Score = Penalty > Profit ? 0 : Profit - Penalty;
Info->NeedToBeConvertedToVALU = Info->Score < 3;
return Info->NeedToBeConvertedToVALU;
}
void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
// The main function that computes the VGPR to SGPR copy score
// and decides how the copy is further lowered: v_readfirstlane_b32 or moveToVALU.
auto needToBeConvertedToVALU = [&](V2SCopyInfo *I) -> bool {
if (I->SChain.empty()) {
I->Score = 0;
return true;
}
I->Siblings = SiblingPenalty[*std::max_element(
I->SChain.begin(), I->SChain.end(),
[&](MachineInstr *A, MachineInstr *B) -> bool {
return SiblingPenalty[A].size() < SiblingPenalty[B].size();
})];
I->Siblings.remove_if([&](unsigned ID) { return ID == I->ID; });
// The loop below computes the number of other VGPR to SGPR copies
// that contribute to the current copy's SALU chain. We assume that
// all the V2SCopies with the same source virtual register will be
// squashed into one by regalloc. We also take care of the V2SCopies
// of different subregs of the same register.
SmallSet<std::pair<Register, unsigned>, 4> SrcRegs;
for (auto J : I->Siblings) {
auto InfoIt = V2SCopies.find(J);
if (InfoIt != V2SCopies.end()) {
MachineInstr *SiblingCopy = InfoIt->getSecond().Copy;
if (SiblingCopy->isImplicitDef())
// the COPY has already been MoveToVALUed
continue;
SrcRegs.insert(std::make_pair(SiblingCopy->getOperand(1).getReg(),
SiblingCopy->getOperand(1).getSubReg()));
}
}
I->SiblingPenalty = SrcRegs.size();
unsigned Penalty =
I->NumSVCopies + I->SiblingPenalty + I->NumReadfirstlanes;
unsigned Profit = I->SChain.size();
I->Score = Penalty > Profit ? 0 : Profit - Penalty;
I->NeedToBeConvertedToVALU = I->Score < 3;
return I->NeedToBeConvertedToVALU;
};
auto needProcessing = [](MachineInstr &MI) -> bool {
switch (MI.getOpcode()) {
case AMDGPU::COPY:
case AMDGPU::WQM:
case AMDGPU::STRICT_WQM:
case AMDGPU::SOFT_WQM:
case AMDGPU::STRICT_WWM:
case AMDGPU::REG_SEQUENCE:
case AMDGPU::PHI:
return true;
default:
return false;
}
};
SmallSet<MachineInstr *, 4> OutOfOrderProcessedCopies;
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
++BI) {
MachineBasicBlock *MBB = &*BI;
for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
++I) {
MachineInstr *MI = &*I;
if (!needProcessing(*MI))
continue;
if (MI->isRegSequence() || MI->isPHI()) {
MachineBasicBlock::iterator J = I;
if (TRI->isSGPRClass(TII->getOpRegClass(*MI, 0))) {
for (MachineOperand &MO : MI->operands()) {
if (!MO.isReg() || !MO.getReg().isVirtual())
continue;
const TargetRegisterClass *SrcRC = MRI->getRegClass(MO.getReg());
if (TRI->hasVectorRegisters(SrcRC)) {
const TargetRegisterClass *DestRC =
TRI->getEquivalentSGPRClass(SrcRC);
Register NewDst = MRI->createVirtualRegister(DestRC);
MachineBasicBlock *BlockToInsertCopy = MBB;
MachineBasicBlock::iterator PointToInsertCopy = I;
if (MI->isPHI()) {
BlockToInsertCopy =
MI->getOperand(MI->getOperandNo(&MO) + 1).getMBB();
PointToInsertCopy =
BlockToInsertCopy->getFirstInstrTerminator();
}
MachineBasicBlock::iterator NewI =
BuildMI(*BlockToInsertCopy, PointToInsertCopy,
PointToInsertCopy->getDebugLoc(),
TII->get(AMDGPU::COPY), NewDst)
.addReg(MO.getReg());
MO.setReg(NewDst);
if (!MI->isPHI()) {
I = NewI;
MI = &*I;
} else {
// We insert the copy into a basic block that may already have been
// processed. Pass it to the analysis explicitly.
V2SCopyInfo In(getNextVGPRToSGPRCopyId(), MI,
TRI->getRegSizeInBits(*DestRC));
analyzeVGPRToSGPRCopy(In);
V2SCopies[In.ID] = In;
OutOfOrderProcessedCopies.insert(MI);
}
}
}
}
if (J == I)
continue;
}
const TargetRegisterClass *SrcRC, *DstRC;
std::tie(SrcRC, DstRC) = getCopyRegClasses(*MI, *TRI, *MRI);
if (!isVGPRToSGPRCopy(SrcRC, DstRC, *TRI))
continue;
if (lowerSpecialCase(*MI))
continue;
if (OutOfOrderProcessedCopies.contains(MI))
continue;
V2SCopyInfo In(getNextVGPRToSGPRCopyId(), MI,
TRI->getRegSizeInBits(*DstRC));
analyzeVGPRToSGPRCopy(In);
V2SCopies[In.ID] = In;
}
}
SmallVector<unsigned, 8> LoweringWorklist;
for (auto &C : V2SCopies) {
if (needToBeConvertedToVALU(&C.second))
@@ -1059,46 +1142,3 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
MI->eraseFromParent();
}
}
void SIFixSGPRCopies::fixSCCCopies(bool IsWave32) {
for (auto MI : SCCCopies) {
// May be lowered out
if (!MI->getParent())
continue;
// May already have been lowered.
if (!MI->isCopy())
continue;
Register SrcReg = MI->getOperand(1).getReg();
Register DstReg = MI->getOperand(0).getReg();
if (SrcReg == AMDGPU::SCC) {
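// A read of SCC cannot be copied directly: materialize the bit as a
// full -1/0 lane mask with S_CSELECT, then copy that value instead.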
Register SCCCopy = MRI->createVirtualRegister(
TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID));
MachineBasicBlock::iterator I =
BuildMI(*MI->getParent(),
std::next(MachineBasicBlock::iterator(MI)),
MI->getDebugLoc(),
TII->get(IsWave32 ? AMDGPU::S_CSELECT_B32
: AMDGPU::S_CSELECT_B64),
SCCCopy)
.addImm(-1)
.addImm(0);
BuildMI(*MI->getParent(), std::next(I), I->getDebugLoc(),
TII->get(AMDGPU::COPY), DstReg)
.addReg(SCCCopy);
MI->eraseFromParent();
continue;
}
if (DstReg == AMDGPU::SCC) {
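// A write of SCC is emulated by ANDing the source with EXEC; the AND
// result in Tmp is unused, only the SCC side effect is wanted.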
unsigned Opcode = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
Register Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
Register Tmp = MRI->createVirtualRegister(TRI->getBoolRC());
BuildMI(*MI->getParent(), std::next(MachineBasicBlock::iterator(MI)),
MI->getDebugLoc(), TII->get(Opcode))
.addReg(Tmp, getDefRegState(true))
.addReg(SrcReg)
.addReg(Exec);
MI->eraseFromParent();
}
}
}


@@ -41,9 +41,9 @@ body: |
; W64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
; W64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; W64-NEXT: {{ $}}
; W64-NEXT: .1:
@@ -88,9 +88,9 @@ body: |
; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
; W32-NEXT: {{ $}}
; W32-NEXT: .1:
@@ -160,10 +160,10 @@ body: |
; W64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, killed [[COPY6]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, [[COPY2]], %subreg.sub3
; W64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; W64-NEXT: {{ $}}
; W64-NEXT: .1:
@@ -207,10 +207,10 @@ body: |
; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, killed [[COPY6]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, [[COPY2]], %subreg.sub3
; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
; W32-NEXT: {{ $}}
; W32-NEXT: .1:
@@ -280,10 +280,10 @@ body: |
; W64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, [[COPY2]], %subreg.sub3
; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, killed [[COPY6]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
; W64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; W64-NEXT: {{ $}}
; W64-NEXT: .1:
@@ -327,10 +327,10 @@ body: |
; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, [[COPY2]], %subreg.sub3
; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, killed [[COPY6]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
; W32-NEXT: {{ $}}
; W32-NEXT: .1:
@@ -400,9 +400,9 @@ body: |
; ADDR64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; ADDR64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; ADDR64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
; ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
; ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
; ADDR64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
; ADDR64-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
; ADDR64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
; ADDR64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
@@ -429,9 +429,9 @@ body: |
; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
; W32-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
; W32-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
@@ -485,9 +485,9 @@ body: |
; ADDR64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; ADDR64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; ADDR64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
; ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
; ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
; ADDR64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
; ADDR64-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
; ADDR64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
; ADDR64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
@@ -513,9 +513,9 @@ body: |
; W64-NO-ADDR64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W64-NO-ADDR64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W64-NO-ADDR64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
; W64-NO-ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
; W64-NO-ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
; W64-NO-ADDR64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
; W64-NO-ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
; W64-NO-ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
; W64-NO-ADDR64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; W64-NO-ADDR64-NEXT: {{ $}}
; W64-NO-ADDR64-NEXT: .1:
@@ -560,9 +560,9 @@ body: |
; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
; W32-NEXT: {{ $}}
; W32-NEXT: .1:


@@ -433,7 +433,7 @@ define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline {
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
; GFX9-O0-NEXT: ; implicit-def: $sgpr34
; GFX9-O0-NEXT: ; implicit-def: $sgpr34
; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
@@ -585,7 +585,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: ; implicit-def: $sgpr40
; GFX9-O0-NEXT: ; implicit-def: $sgpr40
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9
; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[40:41], v2, v4
@@ -722,7 +722,7 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10
; GFX9-O0-NEXT: ; implicit-def: $sgpr35
; GFX9-O0-NEXT: ; implicit-def: $sgpr35
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $exec
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $exec
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
; GFX9-O0-NEXT: s_mov_b32 s35, 0x7fffffff
@@ -741,7 +741,7 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12
; GFX9-O0-NEXT: ; implicit-def: $sgpr35
; GFX9-O0-NEXT: ; implicit-def: $sgpr35
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $exec
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $exec
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
@@ -770,7 +770,7 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
; GFX9-O0-NEXT: ; implicit-def: $sgpr35
; GFX9-O0-NEXT: ; implicit-def: $sgpr35
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $exec
; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $exec
; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $exec
; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $exec
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11