[AMDGPU] Don't check hasStackObjects() when reserving VGPR

We have amdgpu_gfx functions with high register pressure. If we do
not reserve a VGPR for SGPR spills, we fall into the path that spills
SGPRs to memory, which not only has correctness issues but also
performs very badly.

I don't know why the check for hasStackObjects() is there; in our
case, there are no stack objects at the time of finalizeLowering().
So just remove the check, so that we always reserve a VGPR for
possible SGPR spills in non-entry functions.

Reviewed by: arsenm

Differential Revision: https://reviews.llvm.org/D98345
Ruiling Song 2021-03-09 22:05:21 +08:00
parent 4cee5cad28
commit e8e6817d00
2 changed files with 48 additions and 2 deletions

@@ -11649,8 +11649,7 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
   // "amdgpu-reserve-vgpr-for-sgpr-spill" option is used
   // FIXME: We won't need this hack if we split SGPR allocation from VGPR
   if (VGPRReserveforSGPRSpill && TRI->spillSGPRToVGPR() &&
-      !Info->VGPRReservedForSGPRSpill && !Info->isEntryFunction() &&
-      MF.getFrameInfo().hasStackObjects())
+      !Info->VGPRReservedForSGPRSpill && !Info->isEntryFunction())
     Info->reserveVGPRforSGPRSpills(MF);
 }

@@ -187,4 +187,51 @@ define void @reserve_vgpr_with_tail_call() #0 {
ret void
}
; GCN-LABEL: {{^}}reserve_vgpr_for_sgpr_spills_no_alloca:
; GCN: v_writelane_b32 v5, s34, 0
; GCN: v_writelane_b32 v5, s35, 1
; GCN: v_writelane_b32 v5, s36, 2
; GCN: v_writelane_b32 v5, s37, 3
; GCN: v_readlane_b32 s37, v5, 3
; GCN: v_readlane_b32 s36, v5, 2
; GCN: v_readlane_b32 s35, v5, 1
; GCN: v_readlane_b32 s34, v5, 0
define void @reserve_vgpr_for_sgpr_spills_no_alloca(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
%a = load <4 x i32>, <4 x i32> addrspace(1)* %in
call void asm sideeffect "",
"~{v6},~{v7},~{v8},~{v9}
,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19}
,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29}
,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39}
,~{v40},~{v41},~{v42},~{v43},~{v44},~{v45},~{v46},~{v47},~{v48},~{v49}
,~{v50},~{v51},~{v52},~{v53},~{v54},~{v55},~{v56},~{v57},~{v58},~{v59}
,~{v60},~{v61},~{v62},~{v63},~{v64},~{v65},~{v66},~{v67},~{v68},~{v69}
,~{v70},~{v71},~{v72},~{v73},~{v74},~{v75},~{v76},~{v77},~{v78},~{v79}
,~{v80},~{v81},~{v82},~{v83},~{v84},~{v85},~{v86},~{v87},~{v88},~{v89}
,~{v90},~{v91},~{v92},~{v93},~{v94},~{v95},~{v96},~{v97},~{v98},~{v99}
,~{v100},~{v101},~{v102},~{v103},~{v104},~{v105},~{v106},~{v107},~{v108},~{v109}
,~{v110},~{v111},~{v112},~{v113},~{v114},~{v115},~{v116},~{v117},~{v118},~{v119}
,~{v120},~{v121},~{v122},~{v123},~{v124},~{v125},~{v126},~{v127},~{v128},~{v129}
,~{v130},~{v131},~{v132},~{v133},~{v134},~{v135},~{v136},~{v137},~{v138},~{v139}
,~{v140},~{v141},~{v142},~{v143},~{v144},~{v145},~{v146},~{v147},~{v148},~{v149}
,~{v150},~{v151},~{v152},~{v153},~{v154},~{v155},~{v156},~{v157},~{v158},~{v159}
,~{v160},~{v161},~{v162},~{v163},~{v164},~{v165},~{v166},~{v167},~{v168},~{v169}
,~{v170},~{v171},~{v172},~{v173},~{v174},~{v175},~{v176},~{v177},~{v178},~{v179}
,~{v180},~{v181},~{v182},~{v183},~{v184},~{v185},~{v186},~{v187},~{v188},~{v189}
,~{v190},~{v191},~{v192},~{v193},~{v194},~{v195},~{v196},~{v197},~{v198},~{v199}
,~{v200},~{v201},~{v202},~{v203},~{v204},~{v205},~{v206},~{v207},~{v208},~{v209}
,~{v210},~{v211},~{v212},~{v213},~{v214},~{v215},~{v216},~{v217},~{v218},~{v219}
,~{v220},~{v221},~{v222},~{v223},~{v224},~{v225},~{v226},~{v227},~{v228},~{v229}
,~{v230},~{v231},~{v232},~{v233},~{v234},~{v235},~{v236},~{v237},~{v238},~{v239}
,~{v240},~{v241},~{v242},~{v243},~{v244},~{v245},~{v246},~{v247},~{v248},~{v249}
,~{v250},~{v251},~{v252},~{v253},~{v254},~{v255}" () #0
call void asm sideeffect "",
"~{s34},~{s35},~{s36},~{s37}" () #0
store <4 x i32> %a, <4 x i32> addrspace(1)* %out
ret void
}
attributes #0 = { nounwind noinline norecurse "amdgpu-flat-work-group-size"="1,256" }