[AMDGPU] Don't check hasStackObjects() when reserving VGPR

We have amdgpu_gfx functions with high register pressure. If we do
not reserve a VGPR for SGPR spills, we fall into the path that spills
SGPRs to memory, which not only has a correctness issue but also has
really bad performance.
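
For scale, a lane spill is a single v_writelane_b32/v_readlane_b32
pair against the reserved VGPR, while the memory path has to
round-trip every SGPR through scratch. Roughly (an illustrative
sketch only; the register numbers and scratch descriptor are made up):

  ; spill/restore via a reserved VGPR lane (what we want):
  v_writelane_b32 v5, s34, 0          ; save s34 into lane 0 of reserved v5
  v_readlane_b32  s34, v5, 0          ; restore s34

  ; spill/restore via scratch memory (the fallback this patch avoids):
  v_mov_b32_e32       v0, s34                 ; copy the SGPR into a VGPR
  buffer_store_dword  v0, off, s[0:3], s32    ; store to scratch
  buffer_load_dword   v0, off, s[0:3], s32    ; later, reload from scratch
  s_waitcnt vmcnt(0)                          ; wait for the load
  v_readfirstlane_b32 s34, v0                 ; move back into the SGPR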

I don't know why the check for hasStackObjects() is there; in our
case, we don't have stack objects yet at the time of
finalizeLowering(). So remove the check, and always reserve a VGPR
for possible SGPR spills in non-entry functions.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D98345
Ruiling Song 2021-03-09 22:05:21 +08:00
parent 4cee5cad28
commit e8e6817d00
2 changed files with 48 additions and 2 deletions

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

@@ -11649,8 +11649,7 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
   // "amdgpu-reserve-vgpr-for-sgpr-spill" option is used
   // FIXME: We won't need this hack if we split SGPR allocation from VGPR
   if (VGPRReserveforSGPRSpill && TRI->spillSGPRToVGPR() &&
-      !Info->VGPRReservedForSGPRSpill && !Info->isEntryFunction() &&
-      MF.getFrameInfo().hasStackObjects())
+      !Info->VGPRReservedForSGPRSpill && !Info->isEntryFunction())
     Info->reserveVGPRforSGPRSpills(MF);
 }

llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll

@@ -187,4 +187,51 @@ define void @reserve_vgpr_with_tail_call() #0 {
   ret void
 }

; GCN-LABEL: {{^}}reserve_vgpr_for_sgpr_spills_no_alloca:
; GCN: v_writelane_b32 v5, s34, 0
; GCN: v_writelane_b32 v5, s35, 1
; GCN: v_writelane_b32 v5, s36, 2
; GCN: v_writelane_b32 v5, s37, 3
; GCN: v_readlane_b32 s37, v5, 3
; GCN: v_readlane_b32 s36, v5, 2
; GCN: v_readlane_b32 s35, v5, 1
; GCN: v_readlane_b32 s34, v5, 0
define void @reserve_vgpr_for_sgpr_spills_no_alloca(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
%a = load <4 x i32>, <4 x i32> addrspace(1)* %in
call void asm sideeffect "",
"~{v6},~{v7},~{v8},~{v9}
,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19}
,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29}
,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39}
,~{v40},~{v41},~{v42},~{v43},~{v44},~{v45},~{v46},~{v47},~{v48},~{v49}
,~{v50},~{v51},~{v52},~{v53},~{v54},~{v55},~{v56},~{v57},~{v58},~{v59}
,~{v60},~{v61},~{v62},~{v63},~{v64},~{v65},~{v66},~{v67},~{v68},~{v69}
,~{v70},~{v71},~{v72},~{v73},~{v74},~{v75},~{v76},~{v77},~{v78},~{v79}
,~{v80},~{v81},~{v82},~{v83},~{v84},~{v85},~{v86},~{v87},~{v88},~{v89}
,~{v90},~{v91},~{v92},~{v93},~{v94},~{v95},~{v96},~{v97},~{v98},~{v99}
,~{v100},~{v101},~{v102},~{v103},~{v104},~{v105},~{v106},~{v107},~{v108},~{v109}
,~{v110},~{v111},~{v112},~{v113},~{v114},~{v115},~{v116},~{v117},~{v118},~{v119}
,~{v120},~{v121},~{v122},~{v123},~{v124},~{v125},~{v126},~{v127},~{v128},~{v129}
,~{v130},~{v131},~{v132},~{v133},~{v134},~{v135},~{v136},~{v137},~{v138},~{v139}
,~{v140},~{v141},~{v142},~{v143},~{v144},~{v145},~{v146},~{v147},~{v148},~{v149}
,~{v150},~{v151},~{v152},~{v153},~{v154},~{v155},~{v156},~{v157},~{v158},~{v159}
,~{v160},~{v161},~{v162},~{v163},~{v164},~{v165},~{v166},~{v167},~{v168},~{v169}
,~{v170},~{v171},~{v172},~{v173},~{v174},~{v175},~{v176},~{v177},~{v178},~{v179}
,~{v180},~{v181},~{v182},~{v183},~{v184},~{v185},~{v186},~{v187},~{v188},~{v189}
,~{v190},~{v191},~{v192},~{v193},~{v194},~{v195},~{v196},~{v197},~{v198},~{v199}
,~{v200},~{v201},~{v202},~{v203},~{v204},~{v205},~{v206},~{v207},~{v208},~{v209}
,~{v210},~{v211},~{v212},~{v213},~{v214},~{v215},~{v216},~{v217},~{v218},~{v219}
,~{v220},~{v221},~{v222},~{v223},~{v224},~{v225},~{v226},~{v227},~{v228},~{v229}
,~{v230},~{v231},~{v232},~{v233},~{v234},~{v235},~{v236},~{v237},~{v238},~{v239}
,~{v240},~{v241},~{v242},~{v243},~{v244},~{v245},~{v246},~{v247},~{v248},~{v249}
,~{v250},~{v251},~{v252},~{v253},~{v254},~{v255}" () #0
call void asm sideeffect "",
"~{s34},~{s35},~{s36},~{s37}" () #0
store <4 x i32> %a, <4 x i32> addrspace(1)* %out
ret void
}
attributes #0 = { nounwind noinline norecurse "amdgpu-flat-work-group-size"="1,256" }