[AMDGPU] SIFoldOperands: clean up tryConstantFoldOp

First, clean up the strange API of tryConstantFoldOp, which took an
immediate operand value but gave no indication of which operand the
value was for.

Second, clean up the loop that calls tryConstantFoldOp so that it no
longer has to restart from the beginning every time it folds an
instruction.

This is NFCI, but there are some minor codegen changes caused by the
order in which things are folded.

Differential Revision: https://reviews.llvm.org/D100031
Jay Foad 2021-04-07 13:49:07 +01:00
parent 65cd0d6be4
commit 7c706af03b
2 changed files with 88 additions and 93 deletions
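As a reading aid, the following is a condensed standalone sketch of the two cleanups described in the commit message; it is not LLVM code. The Inst type, the tryConstantFold helper, and the tiny "and x, 0 -> 0" fold are invented for illustration only; the real change, shown in the first diff below, operates on MachineInstr uses of the destination register.

// Standalone illustration only (invented types; not the LLVM API).
// Cleanup 1: the fold routine locates its own operand instead of being handed
//            a bare immediate with no indication of which operand it is for.
// Cleanup 2: the caller makes a single pass over the uses instead of
//            restarting the whole scan after every successful fold.
#include <cstdio>
#include <vector>

struct Inst { int Opcode; int Src0; }; // stand-in for a machine instruction

// "and x, 0 -> 0" style simplification; reads its own source operand.
static bool tryConstantFold(Inst &I) {
  const int AND = 1, MOV = 0;
  if (I.Opcode == AND && I.Src0 == 0) { // and x, 0 -> mov 0
    I.Opcode = MOV;
    return true;
  }
  return false;
}

int main() {
  std::vector<Inst> Uses = {{1, 0}, {2, 5}, {1, 0}};

  // Old shape (cleanup 2): restart from the beginning after every fold.
  //   bool Again;
  //   do {
  //     Again = false;
  //     for (Inst &I : Uses)
  //       if (tryConstantFold(I)) { Again = true; break; }
  //   } while (Again);

  // New shape: one pass, folding each use in place as it is visited.
  for (Inst &I : Uses)
    if (tryConstantFold(I))
      std::puts("constant folded one use");

  for (const Inst &I : Uses)
    std::printf("Opcode=%d Src0=%d\n", I.Opcode, I.Src0);
  return 0;
}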


@@ -1046,14 +1046,19 @@ static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
// Try to simplify operations with a constant that may appear after instruction
// selection.
// TODO: See if a frame index with a fixed offset can fold.
static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
const SIInstrInfo *TII,
MachineInstr *MI,
MachineOperand *ImmOp) {
static bool tryConstantFoldOp(MachineRegisterInfo &MRI, const SIInstrInfo *TII,
MachineInstr *MI) {
unsigned Opc = MI->getOpcode();
if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
Opc == AMDGPU::S_NOT_B32) {
MI->getOperand(1).ChangeToImmediate(~ImmOp->getImm());
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
if (Src0Idx == -1)
return false;
MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx));
if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
Opc == AMDGPU::S_NOT_B32) &&
Src0->isImm()) {
MI->getOperand(1).ChangeToImmediate(~Src0->getImm());
mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
return true;
}
@@ -1061,9 +1066,6 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
if (Src1Idx == -1)
return false;
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx));
MachineOperand *Src1 = getImmOrMaterializedImm(MRI, MI->getOperand(Src1Idx));
if (!Src0->isImm() && !Src1->isImm())
@@ -1195,19 +1197,9 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
SmallVector<FoldCandidate, 4> FoldList;
MachineOperand &Dst = MI.getOperand(0);
bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
if (FoldingImm) {
unsigned NumLiteralUses = 0;
MachineOperand *NonInlineUse = nullptr;
int NonInlineUseOpNo = -1;
bool Again;
do {
Again = false;
for (auto &Use : make_early_inc_range(MRI->use_nodbg_operands(Dst.getReg()))) {
MachineInstr *UseMI = Use.getParent();
unsigned OpNo = UseMI->getOperandNo(&Use);
if (OpToFold.isImm()) {
for (auto &UseMI :
make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
// Folding the immediate may reveal operations that can be constant
// folded or replaced with a copy. This can happen for example after
// frame indices are lowered to constants or from splitting 64-bit
@@ -1216,18 +1208,21 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
// We may also encounter cases where one or both operands are
// immediates materialized into a register, which would ordinarily not
// be folded due to multiple uses or operand constraints.
if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) {
LLVM_DEBUG(dbgs() << "Constant folded " << *UseMI);
// Some constant folding cases change the same immediate's use to a new
// instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user
// again. The same constant folded instruction could also have a second
// use operand.
FoldList.clear();
Again = true;
break;
if (tryConstantFoldOp(*MRI, TII, &UseMI))
LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
}
}
bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
if (FoldingImm) {
unsigned NumLiteralUses = 0;
MachineOperand *NonInlineUse = nullptr;
int NonInlineUseOpNo = -1;
for (auto &Use :
make_early_inc_range(MRI->use_nodbg_operands(Dst.getReg()))) {
MachineInstr *UseMI = Use.getParent();
unsigned OpNo = UseMI->getOperandNo(&Use);
// Try to fold any inline immediate uses, and then only fold other
// constants if they have one use.
@@ -1247,8 +1242,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
} else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) {
foldOperand(OpToFold, UseMI, OpNo, FoldList,
CopiesToReplace);
foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
} else {
if (++NumLiteralUses == 1) {
NonInlineUse = &Use;
@@ -1256,7 +1250,6 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
}
}
}
} while (Again);
if (NumLiteralUses == 1) {
MachineInstr *UseMI = NonInlineUse->getParent();


@@ -3211,8 +3211,9 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
; GFX6-LABEL: v_fshr_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v6, 0xffff
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX6-NEXT: v_and_b32_e32 v4, v4, v6
; GFX6-NEXT: s_mov_b32 s5, 0xffff
; GFX6-NEXT: v_or_b32_e32 v4, v5, v4
; GFX6-NEXT: v_and_b32_e32 v5, s5, v2
@@ -3228,24 +3229,24 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
; GFX6-NEXT: v_lshlrev_b32_e32 v1, s4, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v5, s6, v5
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2
; GFX6-NEXT: v_and_b32_e32 v6, 15, v4
; GFX6-NEXT: v_and_b32_e32 v7, 15, v4
; GFX6-NEXT: v_or_b32_e32 v1, v1, v5
; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4
; GFX6-NEXT: v_and_b32_e32 v4, 15, v4
; GFX6-NEXT: v_and_b32_e32 v2, s5, v2
; GFX6-NEXT: v_bfe_u32 v6, v6, 0, 16
; GFX6-NEXT: v_and_b32_e32 v2, v2, v6
; GFX6-NEXT: v_bfe_u32 v7, v7, 0, 16
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2
; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16
; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v0, v6, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, v7, v0
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: v_and_b32_e32 v2, 15, v5
; GFX6-NEXT: v_xor_b32_e32 v4, -1, v5
; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1
; GFX6-NEXT: v_and_b32_e32 v2, s5, v3
; GFX6-NEXT: v_and_b32_e32 v2, v3, v6
; GFX6-NEXT: v_and_b32_e32 v4, 15, v4
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2
; GFX6-NEXT: v_bfe_u32 v3, v4, 0, 16
@@ -4137,7 +4138,7 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v8
; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8
; GFX6-NEXT: v_and_b32_e32 v8, 15, v8
; GFX6-NEXT: v_and_b32_e32 v4, s5, v4
; GFX6-NEXT: v_and_b32_e32 v4, v4, v12
; GFX6-NEXT: v_bfe_u32 v11, v11, 0, 16
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4
; GFX6-NEXT: v_bfe_u32 v8, v8, 0, 16
@@ -4149,18 +4150,18 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 1, v5
; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1
; GFX6-NEXT: v_and_b32_e32 v4, s5, v5
; GFX6-NEXT: v_and_b32_e32 v4, v5, v12
; GFX6-NEXT: v_and_b32_e32 v8, 15, v8
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4
; GFX6-NEXT: v_bfe_u32 v5, v8, 0, 16
; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4
; GFX6-NEXT: v_or_b32_e32 v1, v1, v4
; GFX6-NEXT: v_and_b32_e32 v4, s5, v6
; GFX6-NEXT: v_and_b32_e32 v4, v6, v12
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v2, s4, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v4, s6, v4
; GFX6-NEXT: v_or_b32_e32 v2, v2, v4
; GFX6-NEXT: v_and_b32_e32 v4, s5, v7
; GFX6-NEXT: v_and_b32_e32 v4, v7, v12
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v3, s4, v3
; GFX6-NEXT: v_lshrrev_b32_e32 v4, s6, v4
@@ -4172,7 +4173,7 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6
; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6
; GFX6-NEXT: v_and_b32_e32 v6, 15, v6
; GFX6-NEXT: v_and_b32_e32 v4, s5, v4
; GFX6-NEXT: v_and_b32_e32 v4, v4, v12
; GFX6-NEXT: v_bfe_u32 v8, v8, 0, 16
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4
; GFX6-NEXT: v_bfe_u32 v6, v6, 0, 16
@@ -4183,7 +4184,7 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
; GFX6-NEXT: v_xor_b32_e32 v6, -1, v7
; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3
; GFX6-NEXT: v_and_b32_e32 v4, s5, v5
; GFX6-NEXT: v_and_b32_e32 v4, v5, v12
; GFX6-NEXT: v_and_b32_e32 v6, 15, v6
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4
; GFX6-NEXT: v_bfe_u32 v5, v6, 0, 16
@@ -4205,21 +4206,21 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
; GFX8-NEXT: v_lshrrev_b16_e32 v8, 14, v8
; GFX8-NEXT: v_or_b32_e32 v0, v0, v8
; GFX8-NEXT: v_lshlrev_b16_e32 v8, 1, v2
; GFX8-NEXT: v_and_b32_e32 v10, 15, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v4
; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_and_b32_e32 v9, 15, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v4
; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4
; GFX8-NEXT: v_and_b32_e32 v4, 15, v4
; GFX8-NEXT: v_lshrrev_b16_e32 v8, 1, v8
; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_lshlrev_b16_e32 v6, v9, v6
; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v8
; GFX8-NEXT: v_xor_b32_e32 v8, -1, v9
; GFX8-NEXT: v_lshlrev_b16_e32 v6, v10, v6
; GFX8-NEXT: v_or_b32_e32 v4, v6, v4
; GFX8-NEXT: v_and_b32_e32 v6, 15, v9
; GFX8-NEXT: v_and_b32_e32 v8, 15, v8
; GFX8-NEXT: v_and_b32_e32 v6, 15, v7
; GFX8-NEXT: v_xor_b32_e32 v7, -1, v7
; GFX8-NEXT: v_and_b32_e32 v7, 15, v7
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v2
; GFX8-NEXT: v_lshlrev_b16_e32 v0, v6, v0
; GFX8-NEXT: v_lshrrev_b16_e32 v2, v8, v2
; GFX8-NEXT: v_lshrrev_b16_e32 v2, v7, v2
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
; GFX8-NEXT: v_mov_b32_e32 v2, 16
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
@@ -4228,23 +4229,24 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
; GFX8-NEXT: v_lshlrev_b16_e32 v4, 1, v1
; GFX8-NEXT: v_lshrrev_b16_e32 v6, 14, v6
; GFX8-NEXT: v_or_b32_e32 v4, v4, v6
; GFX8-NEXT: v_lshrrev_b16_sdwa v6, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_mov_b32_e32 v6, 1
; GFX8-NEXT: v_lshrrev_b16_sdwa v7, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5
; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_lshrrev_b16_e32 v6, 14, v6
; GFX8-NEXT: v_or_b32_e32 v1, v1, v6
; GFX8-NEXT: v_lshlrev_b16_e32 v6, 1, v3
; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_lshrrev_b16_e32 v7, 14, v7
; GFX8-NEXT: v_or_b32_e32 v1, v1, v7
; GFX8-NEXT: v_lshlrev_b16_e32 v7, 1, v3
; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_and_b32_e32 v8, 15, v5
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v5
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v5
; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5
; GFX8-NEXT: v_and_b32_e32 v5, 15, v5
; GFX8-NEXT: v_lshrrev_b16_e32 v6, 1, v6
; GFX8-NEXT: v_lshrrev_b16_e32 v5, v5, v6
; GFX8-NEXT: v_lshrrev_b16_e32 v7, 1, v7
; GFX8-NEXT: v_lshlrev_b16_e32 v4, v8, v4
; GFX8-NEXT: v_xor_b32_e32 v6, -1, v7
; GFX8-NEXT: v_lshrrev_b16_e32 v5, v5, v7
; GFX8-NEXT: v_or_b32_e32 v4, v4, v5
; GFX8-NEXT: v_and_b32_e32 v5, 15, v7
; GFX8-NEXT: v_and_b32_e32 v5, 15, v6
; GFX8-NEXT: v_xor_b32_e32 v6, -1, v6
; GFX8-NEXT: v_and_b32_e32 v6, 15, v6
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v3
; GFX8-NEXT: v_lshlrev_b16_e32 v1, v5, v1