From a99ada528c19987ae94278584bb7d4856c7ce826 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Fri, 21 Nov 2014 22:31:44 +0000 Subject: [PATCH] R600/SI: Emit s_mov_b32 m0, -1 before every DS instruction This s_mov_b32 will write to a virtual register from the M0Reg class and all the ds instructions now take an extra M0Reg explicit argument. This change is necessary to prevent issues with the scheduler mixing together instructions that expect different values in the m0 registers. llvm-svn: 222583 --- llvm/lib/Target/R600/SIISelLowering.cpp | 2 +- llvm/lib/Target/R600/SIInstrFormats.td | 1 + llvm/lib/Target/R600/SIInstrInfo.td | 17 +++++++------- llvm/lib/Target/R600/SIInstructions.td | 15 ++++++------ llvm/lib/Target/R600/SILoadStoreOptimizer.cpp | 10 +++++++- llvm/lib/Target/R600/SILowerControlFlow.cpp | 23 ------------------- llvm/test/CodeGen/R600/shl_add_ptr.ll | 3 ++- 7 files changed, 30 insertions(+), 41 deletions(-) diff --git a/llvm/lib/Target/R600/SIISelLowering.cpp b/llvm/lib/Target/R600/SIISelLowering.cpp index 8d4164a1c397..fb45684e4a4a 100644 --- a/llvm/lib/Target/R600/SIISelLowering.cpp +++ b/llvm/lib/Target/R600/SIISelLowering.cpp @@ -1986,6 +1986,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, const SIInstrInfo *TII = static_cast( getTargetMachine().getSubtargetImpl()->getInstrInfo()); + MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); TII->legalizeOperands(MI); if (TII->isMIMG(MI->getOpcode())) { @@ -2005,7 +2006,6 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet); MI->setDesc(TII->get(NewOpcode)); - MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); MRI.setRegClass(VReg, RC); return; } diff --git a/llvm/lib/Target/R600/SIInstrFormats.td b/llvm/lib/Target/R600/SIInstrFormats.td index 10e0a3f0c13e..ee1a52b2f8f3 100644 --- a/llvm/lib/Target/R600/SIInstrFormats.td +++ b/llvm/lib/Target/R600/SIInstrFormats.td @@ -546,6 +546,7 @@ class DS op, dag outs, dag ins, string asm, list pattern> : let LGKM_CNT = 1; let UseNamedOperandTable = 1; + let DisableEncoding = "$m0"; } class MUBUF op, dag outs, dag ins, string asm, list pattern> : diff --git a/llvm/lib/Target/R600/SIInstrInfo.td b/llvm/lib/Target/R600/SIInstrInfo.td index 713e84edefd2..392c272a8635 100644 --- a/llvm/lib/Target/R600/SIInstrInfo.td +++ b/llvm/lib/Target/R600/SIInstrInfo.td @@ -948,7 +948,7 @@ class DS_1A op, dag outs, dag ins, string asm, list pat> : class DS_Load_Helper op, string asm, RegisterClass regClass> : DS_1A < op, (outs regClass:$vdst), - (ins i1imm:$gds, VReg_32:$addr, ds_offset:$offset), + (ins i1imm:$gds, VReg_32:$addr, ds_offset:$offset, M0Reg:$m0), asm#" $vdst, $addr"#"$offset"#" [M0]", []> { let data0 = 0; @@ -960,7 +960,8 @@ class DS_Load_Helper op, string asm, RegisterClass regClass> : DS_1A < class DS_Load2_Helper op, string asm, RegisterClass regClass> : DS < op, (outs regClass:$vdst), - (ins i1imm:$gds, VReg_32:$addr, ds_offset0:$offset0, ds_offset1:$offset1), + (ins i1imm:$gds, VReg_32:$addr, ds_offset0:$offset0, ds_offset1:$offset1, + M0Reg:$m0), asm#" $vdst, $addr"#"$offset0"#"$offset1 [M0]", []> { let data0 = 0; @@ -973,7 +974,7 @@ class DS_Load2_Helper op, string asm, RegisterClass regClass> : DS < class DS_Store_Helper op, string asm, RegisterClass regClass> : DS_1A < op, (outs), - (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, ds_offset:$offset), + (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, ds_offset:$offset, M0Reg:$m0), asm#" $addr, $data0"#"$offset"#" [M0]", []> { let data1 = 0; @@ -986,7 +987,7 @@ class DS_Store2_Helper op, string asm, RegisterClass regClass> : DS < op, (outs), (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, regClass:$data1, - ds_offset0:$offset0, ds_offset1:$offset1), + ds_offset0:$offset0, ds_offset1:$offset1, M0Reg:$m0), asm#" $addr, $data0, $data1"#"$offset0"#"$offset1 [M0]", []> { let mayStore = 1; @@ -999,7 +1000,7 @@ class DS_Store2_Helper op, string asm, RegisterClass regClass> : DS < class DS_1A1D_RET op, string asm, RegisterClass rc, string noRetOp = ""> : DS_1A < op, (outs rc:$vdst), - (ins i1imm:$gds, VReg_32:$addr, rc:$data0, ds_offset:$offset), + (ins i1imm:$gds, VReg_32:$addr, rc:$data0, ds_offset:$offset, M0Reg:$m0), asm#" $vdst, $addr, $data0"#"$offset"#" [M0]", []>, AtomicNoRet { @@ -1014,7 +1015,7 @@ class DS_1A1D_RET op, string asm, RegisterClass rc, string noRetOp = "" class DS_1A2D_RET op, string asm, RegisterClass rc, string noRetOp = ""> : DS_1A < op, (outs rc:$vdst), - (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, ds_offset:$offset), + (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, ds_offset:$offset, M0Reg:$m0), asm#" $vdst, $addr, $data0, $data1"#"$offset"#" [M0]", []>, AtomicNoRet { @@ -1027,7 +1028,7 @@ class DS_1A2D_RET op, string asm, RegisterClass rc, string noRetOp = "" class DS_1A2D_NORET op, string asm, RegisterClass rc, string noRetOp = asm> : DS_1A < op, (outs), - (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, ds_offset:$offset), + (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, ds_offset:$offset, M0Reg:$m0), asm#" $addr, $data0, $data1"#"$offset"#" [M0]", []>, AtomicNoRet { @@ -1039,7 +1040,7 @@ class DS_1A2D_NORET op, string asm, RegisterClass rc, string noRetOp = class DS_1A1D_NORET op, string asm, RegisterClass rc, string noRetOp = asm> : DS_1A < op, (outs), - (ins i1imm:$gds, VReg_32:$addr, rc:$data0, ds_offset:$offset), + (ins i1imm:$gds, VReg_32:$addr, rc:$data0, ds_offset:$offset, M0Reg:$m0), asm#" $addr, $data0"#"$offset"#" [M0]", []>, AtomicNoRet { diff --git a/llvm/lib/Target/R600/SIInstructions.td b/llvm/lib/Target/R600/SIInstructions.td index bd91577a8319..e1eb95580ac0 100644 --- a/llvm/lib/Target/R600/SIInstructions.td +++ b/llvm/lib/Target/R600/SIInstructions.td @@ -2614,7 +2614,7 @@ def : ROTRPattern ; class DSReadPat : Pat < (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), - (inst (i1 0), $ptr, (as_i16imm $offset)) + (inst (i1 0), $ptr, (as_i16imm $offset), (S_MOV_B32 -1)) >; def : DSReadPat ; @@ -2632,12 +2632,12 @@ def : DSReadPat ; def : Pat < (v2i32 (local_load (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1))), - (DS_READ2_B32 (i1 0), $ptr, $offset0, $offset1) + (DS_READ2_B32 (i1 0), $ptr, $offset0, $offset1, (S_MOV_B32 -1)) >; class DSWritePat : Pat < (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)), - (inst (i1 0), $ptr, $value, (as_i16imm $offset)) + (inst (i1 0), $ptr, $value, (as_i16imm $offset), (S_MOV_B32 -1)) >; def : DSWritePat ; @@ -2653,12 +2653,13 @@ def : Pat < (local_store v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1)), (DS_WRITE2_B32 (i1 0), $ptr, (EXTRACT_SUBREG $value, sub0), - (EXTRACT_SUBREG $value, sub1), $offset0, $offset1) + (EXTRACT_SUBREG $value, sub1), $offset0, $offset1, + (S_MOV_B32 -1)) >; class DSAtomicRetPat : Pat < (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), - (inst (i1 0), $ptr, $value, (as_i16imm $offset)) + (inst (i1 0), $ptr, $value, (as_i16imm $offset), (S_MOV_B32 -1)) >; // Special case of DSAtomicRetPat for add / sub 1 -> inc / dec @@ -2674,13 +2675,13 @@ class DSAtomicRetPat : Pat < class DSAtomicIncRetPat : Pat < (frag (DS1Addr1Offset i32:$ptr, i32:$offset), (vt 1)), - (inst (i1 0), $ptr, (LoadImm (vt -1)), (as_i16imm $offset)) + (inst (i1 0), $ptr, (LoadImm (vt -1)), (as_i16imm $offset), (S_MOV_B32 -1)) >; class DSAtomicCmpXChg : Pat < (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap), - (inst (i1 0), $ptr, $cmp, $swap, (as_i16imm $offset)) + (inst (i1 0), $ptr, $cmp, $swap, (as_i16imm $offset), (S_MOV_B32 -1)) >; diff --git a/llvm/lib/Target/R600/SILoadStoreOptimizer.cpp b/llvm/lib/Target/R600/SILoadStoreOptimizer.cpp index 4140196e7522..a092bcc2dafd 100644 --- a/llvm/lib/Target/R600/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/R600/SILoadStoreOptimizer.cpp @@ -222,6 +222,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( // Be careful, since the addresses could be subregisters themselves in weird // cases, like vectors of pointers. const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr); + const MachineOperand *M0Reg = TII->getNamedOperand(*I, AMDGPU::OpName::m0); unsigned DestReg0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst)->getReg(); unsigned DestReg1 @@ -262,6 +263,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( .addOperand(*AddrReg) // addr .addImm(NewOffset0) // offset0 .addImm(NewOffset1) // offset1 + .addOperand(*M0Reg) // M0 .addMemOperand(*I->memoperands_begin()) .addMemOperand(*Paired->memoperands_begin()); @@ -280,6 +282,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg()); LIS->shrinkToUses(&AddrRegLI); + LiveInterval &M0RegLI = LIS->getInterval(M0Reg->getReg()); + LIS->shrinkToUses(&M0RegLI); + LIS->getInterval(DestReg); // Create new LI DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); @@ -295,6 +300,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( // Be sure to use .addOperand(), and not .addReg() with these. We want to be // sure we preserve the subregister index and any register flags set on them. const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr); + const MachineOperand *M0Reg = TII->getNamedOperand(*I, AMDGPU::OpName::m0); const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0); const MachineOperand *Data1 = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0); @@ -333,11 +339,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( .addOperand(*Data1) // data1 .addImm(NewOffset0) // offset0 .addImm(NewOffset1) // offset1 + .addOperand(*M0Reg) // m0 .addMemOperand(*I->memoperands_begin()) .addMemOperand(*Paired->memoperands_begin()); // XXX - How do we express subregisters here? - unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() }; + unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg(), + M0Reg->getReg()}; LIS->RemoveMachineInstrFromMaps(I); LIS->RemoveMachineInstrFromMaps(Paired); diff --git a/llvm/lib/Target/R600/SILowerControlFlow.cpp b/llvm/lib/Target/R600/SILowerControlFlow.cpp index 9702565c4625..20e8cecdd299 100644 --- a/llvm/lib/Target/R600/SILowerControlFlow.cpp +++ b/llvm/lib/Target/R600/SILowerControlFlow.cpp @@ -88,7 +88,6 @@ private: void Kill(MachineInstr &MI); void Branch(MachineInstr &MI); - void InitM0ForLDS(MachineBasicBlock::iterator MI); void LoadM0(MachineInstr &MI, MachineInstr *MovRel); void IndirectSrc(MachineInstr &MI); void IndirectDst(MachineInstr &MI); @@ -325,14 +324,6 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) { MI.eraseFromParent(); } -/// The m0 register stores the maximum allowable address for LDS reads and -/// writes. Its value must be at least the size in bytes of LDS allocated by -/// the shader. For simplicity, we set it to the maximum possible value. -void SILowerControlFlowPass::InitM0ForLDS(MachineBasicBlock::iterator MI) { - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), - AMDGPU::M0).addImm(0xffffffff); -} - void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) { MachineBasicBlock &MBB = *MI.getParent(); @@ -391,12 +382,6 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) { .addReg(Save); } - // FIXME: Are there any values other than the LDS address clamp that need to - // be stored in the m0 register and may be live for more than a few - // instructions? If so, we should save the m0 register at the beginning - // of this function and restore it here. - // FIXME: Add support for LDS direct loads. - InitM0ForLDS(&MI); MI.eraseFromParent(); } @@ -465,7 +450,6 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { MachineInstr &MI = *I; if (TII->isDS(MI.getOpcode())) { - NeedM0 = true; NeedWQM = true; } @@ -544,13 +528,6 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { } } - if (NeedM0) { - MachineBasicBlock &MBB = MF.front(); - // Initialize M0 to a value that won't cause LDS access to be discarded - // due to offset clamping - InitM0ForLDS(MBB.getFirstNonPHI()); - } - if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) { MachineBasicBlock &MBB = MF.front(); BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64), diff --git a/llvm/test/CodeGen/R600/shl_add_ptr.ll b/llvm/test/CodeGen/R600/shl_add_ptr.ll index 047cf252e78a..fdb3d3908839 100644 --- a/llvm/test/CodeGen/R600/shl_add_ptr.ll +++ b/llvm/test/CodeGen/R600/shl_add_ptr.ll @@ -68,7 +68,8 @@ define void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3) ; pointer can be used with an offset into the second one. ; SI-LABEL: {{^}}load_shl_base_lds_2: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: s_mov_b32 m0, -1 +; SI-NEXT: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9 [M0] ; SI: s_endpgm define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {