forked from OSchip/llvm-project
Revert "[amdgpu] Lower SGPR-to-VGPR copy in the final phase of ISel."
This reverts commit c3492a1aa1. I think this is the wrong strategy and the wrong place to do this transform anyway. Also reverts follow-up commit 7d593d0d69.
This commit is contained in:
parent
455ca0ebb6
commit
27df165270
|
@ -1244,11 +1244,6 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
|
|||
foldOperand(OpToFold, UseMI, OpNo, FoldList,
|
||||
CopiesToReplace);
|
||||
} else {
|
||||
// Skip updating literal use if it's used in the same REG_SEQUENCE as,
|
||||
// if that literal could be inlined, it's just a single use.
|
||||
if (NonInlineUse && NonInlineUse->getParent() == UseMI &&
|
||||
UseMI->isRegSequence())
|
||||
continue;
|
||||
if (++NumLiteralUses == 1) {
|
||||
NonInlineUse = &*Use;
|
||||
NonInlineUseOpNo = OpNo;
|
||||
|
|
|
@ -102,10 +102,6 @@ static cl::opt<bool> UseDivergentRegisterIndexing(
|
|||
cl::desc("Use indirect register addressing for divergent indexes"),
|
||||
cl::init(false));
|
||||
|
||||
// Command-line switch gating the lowering of SGPR-to-VGPR COPYs into real
// V_MOV instructions. Enabled by default; hidden from standard -help output.
static cl::opt<bool> EnableLowerSGPRToVGPRCopy(
    "lower-sgpr-to-vgpr-copy", cl::Hidden, cl::init(true),
    cl::desc("Enable lowering copy from SGPR to VGPR"));
|
||||
|
||||
static bool hasFP32Denormals(const MachineFunction &MF) {
|
||||
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
|
||||
return Info->getMode().allFP32Denormals();
|
||||
|
@ -11484,60 +11480,6 @@ bool SITargetLowering::checkAsmConstraintValA(SDValue Op,
|
|||
return false;
|
||||
}
|
||||
|
||||
// Lower COPY from SGPR to VGPR to real one as they are real transfer instead
|
||||
// of COPY.
|
||||
static void lowerSGPRToVGPRCopy(MachineFunction &MF, MachineRegisterInfo &MRI,
|
||||
const SIRegisterInfo &TRI,
|
||||
const SIInstrInfo &TII) {
|
||||
for (MachineBasicBlock &MBB : MF) {
|
||||
for (auto BI = MBB.begin(), BE = MBB.end(); BI != BE; /*EMPTY*/) {
|
||||
MachineInstr &MI = *BI++;
|
||||
|
||||
auto IsSGPRToVGPRCopy = [&MRI, &TRI](const MachineInstr &MI) {
|
||||
if (!MI.isCopy())
|
||||
return false;
|
||||
|
||||
auto DstReg = MI.getOperand(0).getReg();
|
||||
auto SrcReg = MI.getOperand(1).getReg();
|
||||
const auto *DstRC = DstReg.isVirtual() ? MRI.getRegClass(DstReg)
|
||||
: TRI.getPhysRegClass(DstReg);
|
||||
const auto *SrcRC = SrcReg.isVirtual() ? MRI.getRegClass(SrcReg)
|
||||
: TRI.getPhysRegClass(SrcReg);
|
||||
return (DstRC == &AMDGPU::VGPR_32RegClass ||
|
||||
DstRC == &AMDGPU::VReg_64RegClass) &&
|
||||
(SrcRC == &AMDGPU::SGPR_32RegClass ||
|
||||
SrcRC == &AMDGPU::SGPR_64RegClass);
|
||||
};
|
||||
|
||||
// Skip if it's not a copy from SGPR to VGPR.
|
||||
if (!IsSGPRToVGPRCopy(MI))
|
||||
continue;
|
||||
|
||||
const MachineOperand &Src = MI.getOperand(1);
|
||||
// FIXME: Need subreg support.
|
||||
if (Src.getSubReg() != AMDGPU::NoSubRegister)
|
||||
continue;
|
||||
// FIXME: Need undef support.
|
||||
if (Src.getReg().isVirtual()) {
|
||||
auto *DefMI = MRI.getVRegDef(Src.getReg());
|
||||
if (!DefMI || DefMI->isImplicitDef())
|
||||
continue;
|
||||
}
|
||||
|
||||
LLVM_DEBUG(dbgs() << "Lower COPY: " << MI);
|
||||
unsigned Opcode = (TRI.getRegSizeInBits(Src.getReg(), MRI) == 64)
|
||||
? AMDGPU::V_MOV_B64_PSEUDO
|
||||
: AMDGPU::V_MOV_B32_e32;
|
||||
auto DstReg = MI.getOperand(0).getReg();
|
||||
auto MIB = BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(Opcode), DstReg)
|
||||
.add(MI.getOperand(1));
|
||||
(void)MIB;
|
||||
LLVM_DEBUG(dbgs() << " to: " << *MIB.getInstr());
|
||||
MI.eraseFromParent();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Figure out which registers should be reserved for stack access. Only after
|
||||
// the function is legalized do we know all of the non-spill stack objects or if
|
||||
// calls are present.
|
||||
|
@ -11546,10 +11488,6 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
|
|||
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
|
||||
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
|
||||
const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
|
||||
const SIInstrInfo *TII = Subtarget->getInstrInfo();
|
||||
|
||||
if (EnableLowerSGPRToVGPRCopy)
|
||||
lowerSGPRToVGPRCopy(MF, MRI, *TRI, *TII);
|
||||
|
||||
if (Info->isEntryFunction()) {
|
||||
// Callable functions have fixed registers used for stack access.
|
||||
|
|
|
@ -11,7 +11,7 @@
|
|||
; R600-NOT: AND
|
||||
; R600: |PV.{{[XYZW]}}|
|
||||
|
||||
; SI: s_bitset0_b32 s{{[0-9]+}}, 31
|
||||
; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
|
||||
; VI: s_bitset0_b32 s{{[0-9]+}}, 31
|
||||
define amdgpu_kernel void @s_fabs_fn_free(float addrspace(1)* %out, i32 %in) {
|
||||
%bc= bitcast i32 %in to float
|
||||
|
@ -24,7 +24,7 @@ define amdgpu_kernel void @s_fabs_fn_free(float addrspace(1)* %out, i32 %in) {
|
|||
; R600-NOT: AND
|
||||
; R600: |PV.{{[XYZW]}}|
|
||||
|
||||
; SI: s_bitset0_b32 s{{[0-9]+}}, 31
|
||||
; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
|
||||
; VI: s_bitset0_b32 s{{[0-9]+}}, 31
|
||||
define amdgpu_kernel void @s_fabs_free(float addrspace(1)* %out, i32 %in) {
|
||||
%bc= bitcast i32 %in to float
|
||||
|
@ -36,7 +36,7 @@ define amdgpu_kernel void @s_fabs_free(float addrspace(1)* %out, i32 %in) {
|
|||
; FUNC-LABEL: {{^}}s_fabs_f32:
|
||||
; R600: |{{(PV|T[0-9])\.[XYZW]}}|
|
||||
|
||||
; SI: s_bitset0_b32 s{{[0-9]+}}, 31
|
||||
; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
|
||||
; VI: s_bitset0_b32 s{{[0-9]+}}, 31
|
||||
define amdgpu_kernel void @s_fabs_f32(float addrspace(1)* %out, float %in) {
|
||||
%fabs = call float @llvm.fabs.f32(float %in)
|
||||
|
|
|
@ -34,7 +34,7 @@ define amdgpu_kernel void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x
|
|||
; R600: |PV.{{[XYZW]}}|
|
||||
; R600: -PV
|
||||
|
||||
; SI: s_bitset1_b32 s{{[0-9]+}}, 31
|
||||
; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
|
||||
; VI: s_bitset1_b32 s{{[0-9]+}}, 31
|
||||
define amdgpu_kernel void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) {
|
||||
%bc = bitcast i32 %in to float
|
||||
|
@ -49,7 +49,7 @@ define amdgpu_kernel void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in)
|
|||
; R600: |PV.{{[XYZW]}}|
|
||||
; R600: -PV
|
||||
|
||||
; SI: s_bitset1_b32 s{{[0-9]+}}, 31
|
||||
; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
|
||||
define amdgpu_kernel void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) {
|
||||
%bc = bitcast i32 %in to float
|
||||
%fabs = call float @fabs(float %bc)
|
||||
|
@ -59,7 +59,7 @@ define amdgpu_kernel void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %
|
|||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}fneg_fabs_f32:
|
||||
; SI: s_bitset1_b32 s{{[0-9]+}}, 31
|
||||
; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
|
||||
define amdgpu_kernel void @fneg_fabs_f32(float addrspace(1)* %out, float %in) {
|
||||
%fabs = call float @llvm.fabs.f32(float %in)
|
||||
%fsub = fsub float -0.000000e+00, %fabs
|
||||
|
|
|
@ -1,26 +0,0 @@
|
|||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs -o - %s | FileCheck %s

target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-ni:7"
target triple = "amdgcn-amd-amdhsa"

; Three loads/stores index off the same kernel-argument pointer; the high half
; of the pointer should be copied into a VGPR exactly once.
; CHECK-LABEL: {{^}}t0:
; CHECK: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], s[4:5], 0x0
; CHECK-COUNT-1: v_mov_b32_e32 v{{[0-9]+}}, s[[PTR_HI]]
; CHECK-NOT: v_mov_b32_e32 v{{[0-9]+}}, s[[PTR_HI]]
define protected amdgpu_kernel void @t0(float addrspace(1)* %p, i32 %i0, i32 %j0, i32 %k0) {
entry:
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i = add i32 %tid, %i0
  %j = add i32 %tid, %j0
  %k = add i32 %tid, %k0
  %p.i = getelementptr float, float addrspace(1)* %p, i32 %i
  %v.i = load float, float addrspace(1)* %p.i
  %p.j = getelementptr float, float addrspace(1)* %p, i32 %j
  %v.j = load float, float addrspace(1)* %p.j
  %sum = fadd float %v.i, %v.j
  %p.k = getelementptr float, float addrspace(1)* %p, i32 %k
  store float %sum, float addrspace(1)* %p.k
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()
|
|
@ -153,9 +153,7 @@ bb:
|
|||
|
||||
; GCN-LABEL: barrier_vmcnt_vscnt_flat_workgroup:
|
||||
; GCN: flat_load_dword
|
||||
; GFX8_9: s_waitcnt lgkmcnt(0){{$}}
|
||||
; GFX8_9: s_waitcnt vmcnt(0){{$}}
|
||||
; GFX10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
|
||||
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
|
||||
; GFX10: s_waitcnt_vscnt null, 0x0
|
||||
; GCN-NEXT: s_barrier
|
||||
define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(i32* %arg) {
|
||||
|
|
|
@ -650,12 +650,12 @@ main_body:
|
|||
; CHECK: image_store
|
||||
; CHECK: s_wqm_b64 exec, exec
|
||||
; CHECK-DAG: v_mov_b32_e32 [[CTR:v[0-9]+]], 0
|
||||
; CHECK-DAG: v_mov_b32_e32 [[SEVEN:v[0-9]+]], 0x40e00000
|
||||
; CHECK-DAG: s_mov_b32 [[SEVEN:s[0-9]+]], 0x40e00000
|
||||
|
||||
; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %body
|
||||
; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]]
|
||||
; CHECK: [[LOOP:BB[0-9]+_[0-9]+]]: ; %loop
|
||||
; CHECK: v_cmp_gt_f32_e32 vcc, [[CTR]], [[SEVEN]]
|
||||
; CHECK: v_cmp_lt_f32_e32 vcc, [[SEVEN]], [[CTR]]
|
||||
; CHECK: s_cbranch_vccz [[LOOPHDR]]
|
||||
|
||||
; CHECK: ; %break
|
||||
|
|
Loading…
Reference in New Issue