R600/SI: implement indirect addressing for SI
Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Tom Stellard <thomas.stellard@amd.com>

llvm-svn: 177277
parent 4a1b9c3bb9
commit 2989ffcacc
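Note for context (not part of the patch): "indirect addressing" here means reading or writing a vector element whose index is only known at run time. A constant index can usually be folded into a plain subregister copy, but a dynamic index has to go through the SI hardware's M0 register and the V_MOVRELS/V_MOVRELD instructions that the hunks below emit. A hypothetical C++ illustration of the kind of access this enables; the function names and the lowering comments are illustrative only:

    // Hypothetical example, not taken from the commit.
    float extract_dynamic(const float vec[16], int i) {
      return vec[i];   // read at a run-time index: conceptually what SI_INDIRECT_SRC implements
    }

    void insert_dynamic(float vec[16], int i, float v) {
      vec[i] = v;      // write at a run-time index: conceptually what SI_INDIRECT_DST_* implements
    }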
@@ -58,6 +58,11 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   computeRegisterProperties();
 
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
+
   setOperationAction(ISD::ADD, MVT::i64, Legal);
   setOperationAction(ISD::ADD, MVT::i32, Legal);
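Note on the hunk above: the wide shuffle types are marked Expand, presumably because there is no single SI instruction for them, so the legalizer essentially rewrites such a shuffle into element-wise extracts that are reassembled into the result vector. A rough model of what that expansion computes, using std::array as a stand-in for the vector types (a sketch only, not legalizer code):

    #include <array>
    #include <cstddef>

    // Sketch: an 8-wide shuffle expanded into element-wise extracts that are
    // then gathered back into the result (conceptually a build_vector).
    std::array<float, 8> expand_shuffle(const std::array<float, 8> &a,
                                        const std::array<float, 8> &b,
                                        const std::array<int, 8> &mask) {
      std::array<float, 8> result{};
      for (std::size_t i = 0; i < 8; ++i) {
        int m = mask[i];
        result[i] = (m < 8) ? a[m] : b[m - 8];  // extract from a or b, insert into result
      }
      return result;
    }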
@@ -1149,6 +1149,31 @@ def SI_KILL : InstSI <
 } // end mayLoad = 1, mayStore = 1, hasSideEffects = 1
   // Uses = [EXEC], Defs = [EXEC]
 
+let Uses = [EXEC], Defs = [EXEC,VCC,M0] in {
+
+def SI_INDIRECT_SRC : InstSI <
+  (outs VReg_32:$dst, SReg_64:$temp),
+  (ins unknown:$src, VSrc_32:$idx, i32imm:$off),
+  "SI_INDIRECT_SRC $dst, $temp, $src, $idx, $off",
+  []
+>;
+
+class SI_INDIRECT_DST<RegisterClass rc> : InstSI <
+  (outs rc:$dst, SReg_64:$temp),
+  (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VReg_32:$val),
+  "SI_INDIRECT_DST $dst, $temp, $src, $idx, $off, $val",
+  []
+> {
+  let Constraints = "$src = $dst";
+}
+
+def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
+def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
+def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
+def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
+
+} // Uses = [EXEC,VCC,M0], Defs = [EXEC,VCC,M0]
+
 } // end IsCodeGenOnly, isPseudo
 
 def : Pat<
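Note on the TableGen hunk above: the SI_INDIRECT_SRC and SI_INDIRECT_DST_V* pseudos carry everything the later lowering needs in a fixed operand order. The $temp SGPR pair gives the pass somewhere to save EXEC, and Constraints = "$src = $dst" ties the inserted-into vector to the result so the element update happens in place. A hypothetical C++ summary of that operand layout, matching how SILowerControlFlowPass reads it via MI.getOperand(N) further down (the struct itself is illustrative and not part of the patch):

    // Hypothetical helper, not in the commit: operand layout of the pseudos.
    struct IndirectPseudoOperands {
      unsigned Dst;   // operand 0: result (VReg_32 for SRC, the vector class for DST)
      unsigned Temp;  // operand 1: SReg_64 scratch used to save/restore EXEC
      unsigned Src;   // operand 2: the vector being indexed ($src = $dst for DST)
      unsigned Idx;   // operand 3: dynamic index, SGPR or VGPR
      int64_t  Off;   // operand 4: constant element offset (i32imm)
      unsigned Val;   // operand 5: element to insert (DST forms only)
    };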
@@ -1521,4 +1546,48 @@ defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
 defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v16i8>;
 defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
 
+/********** ====================== **********/
+/********** Indirect addressing    **********/
+/********** ====================== **********/
+
+multiclass SI_INDIRECT_Pattern <RegisterClass rc, ValueType vt,
+                                SI_INDIRECT_DST IndDst> {
+  // 1. Extract with offset
+  def : Pat<
+    (vector_extract (vt rc:$vec),
+      (i64 (zext (i32 (add VReg_32:$idx, imm:$off))))
+    ),
+    (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, imm:$off))
+  >;
+
+  // 2. Extract without offset
+  def : Pat<
+    (vector_extract (vt rc:$vec),
+      (i64 (zext (i32 VReg_32:$idx)))
+    ),
+    (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, 0))
+  >;
+
+  // 3. Insert with offset
+  def : Pat<
+    (vector_insert (vt rc:$vec), (f32 VReg_32:$val),
+      (i64 (zext (i32 (add VReg_32:$idx, imm:$off))))
+    ),
+    (vt (IndDst (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, imm:$off, VReg_32:$val))
+  >;
+
+  // 4. Insert without offset
+  def : Pat<
+    (vector_insert (vt rc:$vec), (f32 VReg_32:$val),
+      (i64 (zext (i32 VReg_32:$idx)))
+    ),
+    (vt (IndDst (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, 0, VReg_32:$val))
+  >;
+}
+
+defm : SI_INDIRECT_Pattern <VReg_64, v2f32, SI_INDIRECT_DST_V2>;
+defm : SI_INDIRECT_Pattern <VReg_128, v4f32, SI_INDIRECT_DST_V4>;
+defm : SI_INDIRECT_Pattern <VReg_256, v8f32, SI_INDIRECT_DST_V8>;
+defm : SI_INDIRECT_Pattern <VReg_512, v16f32, SI_INDIRECT_DST_V16>;
+
 } // End isSI predicate
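Note on the patterns above: they only match an i32 index zero-extended to i64, which appears to be the form the legalized DAG produces here, and the "with offset" variants fold a constant add on the index into the pseudo's immediate operand so no run-time add is emitted. A short sketch of what that folding buys (assumed example, not from the commit):

    // Sketch: the constant part of the index becomes the $off immediate.
    float load_plus_two(const float vec[8], unsigned i) {
      return vec[i + 2];   // should select as SI_INDIRECT_SRC ..., $idx = i, $off = 2
    }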
@@ -66,6 +66,7 @@ private:
   static const unsigned SkipThreshold = 12;
 
   static char ID;
+  const TargetRegisterInfo *TRI;
   const TargetInstrInfo *TII;
 
   bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
@@ -84,9 +85,14 @@ private:
   void Kill(MachineInstr &MI);
   void Branch(MachineInstr &MI);
 
+  void LoadM0(MachineInstr &MI, MachineInstr *MovRel);
+  void IndirectSrc(MachineInstr &MI);
+  void IndirectDst(MachineInstr &MI);
+
 public:
   SILowerControlFlowPass(TargetMachine &tm) :
-    MachineFunctionPass(ID), TII(tm.getInstrInfo()) { }
+    MachineFunctionPass(ID), TRI(tm.getRegisterInfo()),
+    TII(tm.getInstrInfo()) { }
 
   virtual bool runOnMachineFunction(MachineFunction &MF);
@@ -302,6 +308,104 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) {
   MI.eraseFromParent();
 }
 
+void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+  MachineBasicBlock::iterator I = MI;
+
+  unsigned Save = MI.getOperand(1).getReg();
+  unsigned Idx = MI.getOperand(3).getReg();
+
+  if (AMDGPU::SReg_32RegClass.contains(Idx)) {
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+      .addReg(Idx);
+    MBB.insert(I, MovRel);
+    MI.eraseFromParent();
+    return;
+  }
+
+  assert(AMDGPU::SReg_64RegClass.contains(Save));
+  assert(AMDGPU::VReg_32RegClass.contains(Idx));
+
+  // Save the EXEC mask
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
+    .addReg(AMDGPU::EXEC);
+
+  // Read the next variant into VCC (lower 32 bits) <- also loop target
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32_e32), AMDGPU::VCC)
+    .addReg(Idx);
+
+  // Move index from VCC into M0
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+    .addReg(AMDGPU::VCC);
+
+  // Compare the just read M0 value to all possible Idx values
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC)
+    .addReg(AMDGPU::M0)
+    .addReg(Idx);
+
+  // Update EXEC, save the original EXEC value to VCC
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
+    .addReg(AMDGPU::VCC);
+
+  // Do the actual move
+  MBB.insert(I, MovRel);
+
+  // Update EXEC, switch all done bits to 0 and all todo bits to 1
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+    .addReg(AMDGPU::EXEC)
+    .addReg(AMDGPU::VCC);
+
+  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+    .addImm(-7)
+    .addReg(AMDGPU::EXEC);
+
+  // Restore EXEC
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+    .addReg(Save);
+
+  MI.eraseFromParent();
+}
+
+void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) {
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  unsigned Dst = MI.getOperand(0).getReg();
+  unsigned Vec = MI.getOperand(2).getReg();
+  unsigned Off = MI.getOperand(4).getImm();
+
+  MachineInstr *MovRel =
+    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
+      .addReg(TRI->getSubReg(Vec, AMDGPU::sub0) + Off)
+      .addReg(AMDGPU::M0, RegState::Implicit)
+      .addReg(Vec, RegState::Implicit);
+
+  LoadM0(MI, MovRel);
+}
+
+void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  unsigned Dst = MI.getOperand(0).getReg();
+  unsigned Off = MI.getOperand(4).getImm();
+  unsigned Val = MI.getOperand(5).getReg();
+
+  MachineInstr *MovRel =
+    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
+      .addReg(TRI->getSubReg(Dst, AMDGPU::sub0) + Off, RegState::Define)
+      .addReg(Val)
+      .addReg(AMDGPU::M0, RegState::Implicit)
+      .addReg(Dst, RegState::Implicit);
+
+  LoadM0(MI, MovRel);
+}
+
 bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
 
   bool HaveKill = false;
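Note on LoadM0 above: when the index lives in a VGPR it can differ per lane, but M0 is a scalar register, so the pass emits a small "waterfall" loop: read the lowest active lane's index, move it into M0, run the MOVREL for every lane that happens to share that index, mask those lanes out of EXEC, and branch back while any lanes remain (the .addImm(-7) appears to be a hard-coded backwards offset to the V_READFIRSTLANE_B32). A scalar C++ model of that control flow, written only to illustrate the loop under an assumed 64-lane wave; the helper names are made up and this is not backend code:

    #include <array>
    #include <bitset>
    #include <cstdint>

    constexpr int WaveSize = 64;  // assumed wavefront width

    // "movrel" stands in for the V_MOVRELS/V_MOVRELD instruction that LoadM0
    // inserts between the masking instructions. Returns the restored EXEC.
    std::bitset<WaveSize> waterfall(std::bitset<WaveSize> exec,
                                    const std::array<uint32_t, WaveSize> &idx,
                                    void (*movrel)(int lane, uint32_t m0)) {
      const std::bitset<WaveSize> save = exec;      // S_MOV_B64  save, EXEC
      while (exec.any()) {                          // S_CBRANCH_EXECNZ keeps looping
        int first = 0;
        while (!exec[first])                        // V_READFIRSTLANE_B32
          ++first;
        const uint32_t m0 = idx[first];             // S_MOV_B32  M0, VCC
        std::bitset<WaveSize> match;
        for (int l = 0; l < WaveSize; ++l)          // V_CMP_EQ_U32
          match[l] = exec[l] && idx[l] == m0;
        for (int l = 0; l < WaveSize; ++l)          // S_AND_SAVEEXEC_B64, then MOVREL
          if (match[l])                             //   runs only on matching lanes
            movrel(l, m0);
        exec &= ~match;                             // S_XOR_B64  EXEC, EXEC, VCC
      }
      return save;                                  // S_MOV_B64  EXEC, save
    }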
@@ -363,6 +467,17 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
         case AMDGPU::S_BRANCH:
           Branch(MI);
           break;
+
+        case AMDGPU::SI_INDIRECT_SRC:
+          IndirectSrc(MI);
+          break;
+
+        case AMDGPU::SI_INDIRECT_DST_V2:
+        case AMDGPU::SI_INDIRECT_DST_V4:
+        case AMDGPU::SI_INDIRECT_DST_V8:
+        case AMDGPU::SI_INDIRECT_DST_V16:
+          IndirectDst(MI);
+          break;
       }
     }
   }