R600/SI: implement indirect adressing for SI

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Tom Stellard <thomas.stellard@amd.com>
llvm-svn: 177277
This commit is contained in:
Christian Konig 2013-03-18 11:34:16 +00:00
parent 4a1b9c3bb9
commit 2989ffcacc
3 changed files with 190 additions and 1 deletions

View File

@ -58,6 +58,11 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
computeRegisterProperties();
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
setOperationAction(ISD::ADD, MVT::i64, Legal);
setOperationAction(ISD::ADD, MVT::i32, Legal);

View File

@ -1149,6 +1149,31 @@ def SI_KILL : InstSI <
} // end mayLoad = 1, mayStore = 1, hasSideEffects = 1
// Uses = [EXEC], Defs = [EXEC]
let Uses = [EXEC], Defs = [EXEC,VCC,M0] in {
def SI_INDIRECT_SRC : InstSI <
(outs VReg_32:$dst, SReg_64:$temp),
(ins unknown:$src, VSrc_32:$idx, i32imm:$off),
"SI_INDIRECT_SRC $dst, $temp, $src, $idx, $off",
[]
>;
class SI_INDIRECT_DST<RegisterClass rc> : InstSI <
(outs rc:$dst, SReg_64:$temp),
(ins unknown:$src, VSrc_32:$idx, i32imm:$off, VReg_32:$val),
"SI_INDIRECT_DST $dst, $temp, $src, $idx, $off, $val",
[]
> {
let Constraints = "$src = $dst";
}
def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
} // Uses = [EXEC,VCC,M0], Defs = [EXEC,VCC,M0]
} // end IsCodeGenOnly, isPseudo
def : Pat<
@ -1521,4 +1546,48 @@ defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v16i8>;
defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
/********** ====================== **********/
/********** Indirect adressing **********/
/********** ====================== **********/
multiclass SI_INDIRECT_Pattern <RegisterClass rc, ValueType vt,
SI_INDIRECT_DST IndDst> {
// 1. Extract with offset
def : Pat<
(vector_extract (vt rc:$vec),
(i64 (zext (i32 (add VReg_32:$idx, imm:$off))))
),
(f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, imm:$off))
>;
// 2. Extract without offset
def : Pat<
(vector_extract (vt rc:$vec),
(i64 (zext (i32 VReg_32:$idx)))
),
(f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, 0))
>;
// 3. Insert with offset
def : Pat<
(vector_insert (vt rc:$vec), (f32 VReg_32:$val),
(i64 (zext (i32 (add VReg_32:$idx, imm:$off))))
),
(vt (IndDst (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, imm:$off, VReg_32:$val))
>;
// 4. Insert without offset
def : Pat<
(vector_insert (vt rc:$vec), (f32 VReg_32:$val),
(i64 (zext (i32 VReg_32:$idx)))
),
(vt (IndDst (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, 0, VReg_32:$val))
>;
}
defm : SI_INDIRECT_Pattern <VReg_64, v2f32, SI_INDIRECT_DST_V2>;
defm : SI_INDIRECT_Pattern <VReg_128, v4f32, SI_INDIRECT_DST_V4>;
defm : SI_INDIRECT_Pattern <VReg_256, v8f32, SI_INDIRECT_DST_V8>;
defm : SI_INDIRECT_Pattern <VReg_512, v16f32, SI_INDIRECT_DST_V16>;
} // End isSI predicate

View File

@ -66,6 +66,7 @@ private:
static const unsigned SkipThreshold = 12;
static char ID;
const TargetRegisterInfo *TRI;
const TargetInstrInfo *TII;
bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
@ -84,9 +85,14 @@ private:
void Kill(MachineInstr &MI);
void Branch(MachineInstr &MI);
void LoadM0(MachineInstr &MI, MachineInstr *MovRel);
void IndirectSrc(MachineInstr &MI);
void IndirectDst(MachineInstr &MI);
public:
SILowerControlFlowPass(TargetMachine &tm) :
MachineFunctionPass(ID), TII(tm.getInstrInfo()) { }
MachineFunctionPass(ID), TRI(tm.getRegisterInfo()),
TII(tm.getInstrInfo()) { }
virtual bool runOnMachineFunction(MachineFunction &MF);
@ -302,6 +308,104 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) {
MI.eraseFromParent();
}
void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
MachineBasicBlock::iterator I = MI;
unsigned Save = MI.getOperand(1).getReg();
unsigned Idx = MI.getOperand(3).getReg();
if (AMDGPU::SReg_32RegClass.contains(Idx)) {
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
.addReg(Idx);
MBB.insert(I, MovRel);
MI.eraseFromParent();
return;
}
assert(AMDGPU::SReg_64RegClass.contains(Save));
assert(AMDGPU::VReg_32RegClass.contains(Idx));
// Save the EXEC mask
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
.addReg(AMDGPU::EXEC);
// Read the next variant into VCC (lower 32 bits) <- also loop target
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32_e32), AMDGPU::VCC)
.addReg(Idx);
// Move index from VCC into M0
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
.addReg(AMDGPU::VCC);
// Compare the just read M0 value to all possible Idx values
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC)
.addReg(AMDGPU::M0)
.addReg(Idx);
// Update EXEC, save the original EXEC value to VCC
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
.addReg(AMDGPU::VCC);
// Do the actual move
MBB.insert(I, MovRel);
// Update EXEC, switch all done bits to 0 and all todo bits to 1
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
.addReg(AMDGPU::EXEC)
.addReg(AMDGPU::VCC);
// Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
.addImm(-7)
.addReg(AMDGPU::EXEC);
// Restore EXEC
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
.addReg(Save);
MI.eraseFromParent();
}
void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
unsigned Vec = MI.getOperand(2).getReg();
unsigned Off = MI.getOperand(4).getImm();
MachineInstr *MovRel =
BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
.addReg(TRI->getSubReg(Vec, AMDGPU::sub0) + Off)
.addReg(AMDGPU::M0, RegState::Implicit)
.addReg(Vec, RegState::Implicit);
LoadM0(MI, MovRel);
}
void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
unsigned Off = MI.getOperand(4).getImm();
unsigned Val = MI.getOperand(5).getReg();
MachineInstr *MovRel =
BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
.addReg(TRI->getSubReg(Dst, AMDGPU::sub0) + Off, RegState::Define)
.addReg(Val)
.addReg(AMDGPU::M0, RegState::Implicit)
.addReg(Dst, RegState::Implicit);
LoadM0(MI, MovRel);
}
bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
bool HaveKill = false;
@ -363,6 +467,17 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
case AMDGPU::S_BRANCH:
Branch(MI);
break;
case AMDGPU::SI_INDIRECT_SRC:
IndirectSrc(MI);
break;
case AMDGPU::SI_INDIRECT_DST_V2:
case AMDGPU::SI_INDIRECT_DST_V4:
case AMDGPU::SI_INDIRECT_DST_V8:
case AMDGPU::SI_INDIRECT_DST_V16:
IndirectDst(MI);
break;
}
}
}