AMDGPU: Refactor AsmPrinter

Avoid analyzing functions multiple times. This allows asserting that each function is only analyzed once.

llvm-svn: 301938
2017-05-02 17:14:00 +00:00 · 2017-05-02 17:14:00 +00:00 · b03dd8daae
parent 7b82b4bddb
commit b03dd8daae
2 changed files with 254 additions and 137 deletions
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@ -149,11 +149,9 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
    return;

  const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
-  SIProgramInfo KernelInfo;
  amd_kernel_code_t KernelCode;
  if (STM.isAmdCodeObjectV2(*MF)) {
-    getSIProgramInfo(KernelInfo, *MF);
-    getAmdKernelCode(KernelCode, KernelInfo, *MF);
+    getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);

    OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
    getTargetStreamer().EmitAMDKernelCodeT(KernelCode);
@ -187,7 +185,26 @@ void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
  AsmPrinter::EmitGlobalVariable(GV);
 }

+bool AMDGPUAsmPrinter::doFinalization(Module &M) {
+  CallGraphResourceInfo.clear();
+  return AsmPrinter::doFinalization(M);
+}
+
+// Print comments that apply to both callable functions and entry points.
+void AMDGPUAsmPrinter::emitCommonFunctionComments(
+  uint32_t NumVGPR,
+  uint32_t NumSGPR,
+  uint32_t ScratchSize,
+  uint64_t CodeSize) {
+  OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
+  OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
+  OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
+  OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
+}
+
 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  CurrentProgramInfo = SIProgramInfo();
+
  const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();

  // The starting address of all shader programs must be 256 bytes aligned.
@ -204,11 +221,19 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
    OutStreamer->SwitchSection(ConfigSection);
  }

-  SIProgramInfo KernelInfo;
  if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
-    getSIProgramInfo(KernelInfo, MF);
+    if (MFI->isEntryFunction()) {
+      getSIProgramInfo(CurrentProgramInfo, MF);
+    } else {
+      auto I = CallGraphResourceInfo.insert(
+        std::make_pair(MF.getFunction(), SIFunctionResourceInfo()));
+      SIFunctionResourceInfo &Info = I.first->second;
+      assert(I.second && "should only be called once per function");
+      Info = analyzeResourceUsage(MF);
+    }
+
    if (!STM.isAmdHsaOS()) {
-      EmitProgramInfoSI(MF, KernelInfo);
+      EmitProgramInfoSI(MF, CurrentProgramInfo);
    }
  } else {
    EmitProgramInfoR600(MF);
@ -226,72 +251,87 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
    OutStreamer->SwitchSection(CommentSection);

    if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
-      if (MFI->isEntryFunction()) {
-        OutStreamer->emitRawComment(" Kernel info:", false);
-      } else {
+      if (!MFI->isEntryFunction()) {
        OutStreamer->emitRawComment(" Function info:", false);
+        SIFunctionResourceInfo &Info = CallGraphResourceInfo[MF.getFunction()];
+        emitCommonFunctionComments(
+          Info.NumVGPR,
+          Info.getTotalNumSGPRs(MF.getSubtarget<SISubtarget>()),
+          Info.PrivateSegmentSize,
+          getFunctionCodeSize(MF));
+        return false;
      }

+      OutStreamer->emitRawComment(" Kernel info:", false);
+      emitCommonFunctionComments(CurrentProgramInfo.NumVGPR,
+                                 CurrentProgramInfo.NumSGPR,
+                                 CurrentProgramInfo.ScratchSize,
+                                 getFunctionCodeSize(MF));
+
      OutStreamer->emitRawComment(" codeLenInByte = " +
                                  Twine(getFunctionCodeSize(MF)), false);
-      OutStreamer->emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR),
-                                  false);
-      OutStreamer->emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR),
-                                  false);
+      OutStreamer->emitRawComment(
+        " NumSgprs: " + Twine(CurrentProgramInfo.NumSGPR), false);
+      OutStreamer->emitRawComment(
+        " NumVgprs: " + Twine(CurrentProgramInfo.NumVGPR), false);

-      OutStreamer->emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode),
-                                  false);
-      OutStreamer->emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode),
-                                  false);
-      OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),
-                                  false);
-      OutStreamer->emitRawComment(" LDSByteSize: " + Twine(KernelInfo.LDSSize) +
-                                  " bytes/workgroup (compile time only)", false);
+      OutStreamer->emitRawComment(
+        " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
+      OutStreamer->emitRawComment(
+        " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
+      OutStreamer->emitRawComment(
+        " ScratchSize: " + Twine(CurrentProgramInfo.ScratchSize), false);
+      OutStreamer->emitRawComment(
+        " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
+        " bytes/workgroup (compile time only)", false);

-      if (!MFI->isEntryFunction())
-        return false;
+      OutStreamer->emitRawComment(
+        " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
+      OutStreamer->emitRawComment(
+        " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);

-      OutStreamer->emitRawComment(" SGPRBlocks: " +
-                                  Twine(KernelInfo.SGPRBlocks), false);
-      OutStreamer->emitRawComment(" VGPRBlocks: " +
-                                  Twine(KernelInfo.VGPRBlocks), false);
+      OutStreamer->emitRawComment(
+        " NumSGPRsForWavesPerEU: " +
+        Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
+      OutStreamer->emitRawComment(
+        " NumVGPRsForWavesPerEU: " +
+        Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);

-      OutStreamer->emitRawComment(" NumSGPRsForWavesPerEU: " +
-                                  Twine(KernelInfo.NumSGPRsForWavesPerEU), false);
-      OutStreamer->emitRawComment(" NumVGPRsForWavesPerEU: " +
-                                  Twine(KernelInfo.NumVGPRsForWavesPerEU), false);
-
-      OutStreamer->emitRawComment(" ReservedVGPRFirst: " + Twine(KernelInfo.ReservedVGPRFirst),
-                                  false);
-      OutStreamer->emitRawComment(" ReservedVGPRCount: " + Twine(KernelInfo.ReservedVGPRCount),
-                                  false);
+      OutStreamer->emitRawComment(
+        " ReservedVGPRFirst: " + Twine(CurrentProgramInfo.ReservedVGPRFirst),
+        false);
+      OutStreamer->emitRawComment(
+        " ReservedVGPRCount: " + Twine(CurrentProgramInfo.ReservedVGPRCount),
+        false);

      if (MF.getSubtarget<SISubtarget>().debuggerEmitPrologue()) {
-        OutStreamer->emitRawComment(" DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
-                                    Twine(KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
-        OutStreamer->emitRawComment(" DebuggerPrivateSegmentBufferSGPR: s" +
-                                    Twine(KernelInfo.DebuggerPrivateSegmentBufferSGPR), false);
+        OutStreamer->emitRawComment(
+          " DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
+          Twine(CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
+        OutStreamer->emitRawComment(
+          " DebuggerPrivateSegmentBufferSGPR: s" +
+          Twine(CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR), false);
      }

-      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
-                                  Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)),
-                                  false);
-      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
-                                  Twine(G_00B84C_TRAP_HANDLER(KernelInfo.ComputePGMRSrc2)),
-                                  false);
-      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
-                                  Twine(G_00B84C_TGID_X_EN(KernelInfo.ComputePGMRSrc2)),
-                                  false);
-      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
-                                  Twine(G_00B84C_TGID_Y_EN(KernelInfo.ComputePGMRSrc2)),
-                                  false);
-      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
-                                  Twine(G_00B84C_TGID_Z_EN(KernelInfo.ComputePGMRSrc2)),
-                                  false);
-      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
-                                  Twine(G_00B84C_TIDIG_COMP_CNT(KernelInfo.ComputePGMRSrc2)),
-                                  false);
-
+      OutStreamer->emitRawComment(
+        " COMPUTE_PGM_RSRC2:USER_SGPR: " +
+        Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false);
+      OutStreamer->emitRawComment(
+        " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
+        Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false);
+      OutStreamer->emitRawComment(
+        " COMPUTE_PGM_RSRC2:TGID_X_EN: " +
+        Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
+      OutStreamer->emitRawComment(
+        " COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
+        Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
+      OutStreamer->emitRawComment(
+        " COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
+        Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
+      OutStreamer->emitRawComment(
+        " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
+        Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)),
+        false);
    } else {
      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
      OutStreamer->emitRawComment(
@ -407,71 +447,117 @@ static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
  return false;
 }

-void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
-                                        const MachineFunction &MF) const {
-  const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
-  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-  const MachineRegisterInfo &MRI = MF.getRegInfo();
-  const SIInstrInfo *TII = STM.getInstrInfo();
-  const SIRegisterInfo *RI = &TII->getRegisterInfo();
-
-
-  MCPhysReg NumVGPRReg = AMDGPU::NoRegister;
-  for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
-    if (MRI.isPhysRegUsed(Reg)) {
-      NumVGPRReg = Reg;
-      break;
-    }
-  }
-
-  MCPhysReg NumSGPRReg = AMDGPU::NoRegister;
-  for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
-    if (MRI.isPhysRegUsed(Reg)) {
-      NumSGPRReg = Reg;
-      break;
-    }
-  }
-
-  // We found the maximum register index. They start at 0, so add one to get the
-  // number of registers.
-  ProgInfo.NumVGPR = NumVGPRReg == AMDGPU::NoRegister ? 0 :
-    RI->getHWRegIndex(NumVGPRReg) + 1;
-  ProgInfo.NumSGPR = NumSGPRReg == AMDGPU::NoRegister ? 0 :
-    RI->getHWRegIndex(NumSGPRReg) + 1;
+static unsigned getNumExtraSGPRs(const SISubtarget &ST,
+                                 bool VCCUsed,
+                                 bool FlatScrUsed) {
  unsigned ExtraSGPRs = 0;
-
-  ProgInfo.VCCUsed = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
-                     MRI.isPhysRegUsed(AMDGPU::VCC_HI);
-  if (ProgInfo.VCCUsed)
+  if (VCCUsed)
    ExtraSGPRs = 2;

-  ProgInfo.FlatUsed = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
-                      MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI);
-
-  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
-  // instructions aren't used to access the scratch buffer. Inline assembly
-  // may need it though.
-  //
-  // If we only have implicit uses of flat_scr on flat instructions, it is not
-  // really needed.
-  if (ProgInfo.FlatUsed && !MFI->hasFlatScratchInit() &&
-      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
-       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
-       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
-    ProgInfo.FlatUsed = false;
-  }
-
-  if (STM.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) {
-    if (ProgInfo.FlatUsed)
+  if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) {
+    if (FlatScrUsed)
      ExtraSGPRs = 4;
  } else {
-    if (STM.isXNACKEnabled())
+    if (ST.isXNACKEnabled())
      ExtraSGPRs = 4;

-    if (ProgInfo.FlatUsed)
+    if (FlatScrUsed)
      ExtraSGPRs = 6;
  }

+  return ExtraSGPRs;
+}
+
+int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs(
+  const SISubtarget &ST) const {
+  return NumExplicitSGPR + getNumExtraSGPRs(ST, UsesVCC, UsesFlatScratch);
+}
+
+AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
+  const MachineFunction &MF) const {
+  SIFunctionResourceInfo Info;
+
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
+  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
+                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI);
+
+  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
+  // instructions aren't used to access the scratch buffer. Inline assembly may
+  // need it though.
+  //
+  // If we only have implicit uses of flat_scr on flat instructions, it is not
+  // really needed.
+  if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
+      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
+       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
+       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
+    Info.UsesFlatScratch = false;
+  }
+
+  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
+  Info.PrivateSegmentSize = FrameInfo.getStackSize();
+
+  if (!FrameInfo.hasCalls()) {
+    Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
+                   MRI.isPhysRegUsed(AMDGPU::VCC_HI);
+
+    // If there are no calls, MachineRegisterInfo can tell us the used register
+    // count easily.
+
+    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
+    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
+      if (MRI.isPhysRegUsed(Reg)) {
+        HighestVGPRReg = Reg;
+        break;
+      }
+    }
+
+    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
+    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
+      if (MRI.isPhysRegUsed(Reg)) {
+        HighestSGPRReg = Reg;
+        break;
+      }
+    }
+
+    // We found the maximum register index. They start at 0, so add one to get the
+    // number of registers.
+    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 :
+      TRI.getHWRegIndex(HighestVGPRReg) + 1;
+    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 :
+      TRI.getHWRegIndex(HighestSGPRReg) + 1;
+
+    return Info;
+  }
+
+  llvm_unreachable("calls not implemented");
+}
+
+void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
+                                        const MachineFunction &MF) {
+  SIFunctionResourceInfo Info = analyzeResourceUsage(MF);
+
+  ProgInfo.NumVGPR = Info.NumVGPR;
+  ProgInfo.NumSGPR = Info.NumExplicitSGPR;
+  ProgInfo.ScratchSize = Info.PrivateSegmentSize;
+  ProgInfo.VCCUsed = Info.UsesVCC;
+  ProgInfo.FlatUsed = Info.UsesFlatScratch;
+  ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;
+
+  const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  const SIInstrInfo *TII = STM.getInstrInfo();
+  const SIRegisterInfo *RI = &TII->getRegisterInfo();
+
+  unsigned ExtraSGPRs = getNumExtraSGPRs(STM,
+                                         ProgInfo.VCCUsed,
+                                         ProgInfo.FlatUsed);
  unsigned ExtraVGPRs = STM.getReservedNumVGPRs(MF);

  // Check the addressable register limit before we add ExtraSGPRs.
@ -574,9 +660,6 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
  // Make clamp modifier on NaN input returns 0.
  ProgInfo.DX10Clamp = STM.enableDX10Clamp();

-  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
-  ProgInfo.ScratchSize = FrameInfo.getStackSize();
-
  unsigned LDSAlignShift;
  if (STM.getGeneration() < SISubtarget::SEA_ISLANDS) {
    // LDS is allocated in 64 dword blocks.
@ -646,7 +729,7 @@ static unsigned getRsrcReg(CallingConv::ID CallConv) {
 }

 void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
-                                         const SIProgramInfo &KernelInfo) {
+                                         const SIProgramInfo &CurrentProgramInfo) {
  const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  unsigned RsrcReg = getRsrcReg(MF.getFunction()->getCallingConv());
@ -654,29 +737,29 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
  if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
    OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);

-    OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc1, 4);
+    OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc1, 4);

    OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
-    OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc2, 4);
+    OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc2, 4);

    OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
-    OutStreamer->EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4);
+    OutStreamer->EmitIntValue(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);

    // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
    // 0" comment but I don't see a corresponding field in the register spec.
  } else {
    OutStreamer->EmitIntValue(RsrcReg, 4);
-    OutStreamer->EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) |
-                              S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4);
+    OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
+                              S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
    if (STM.isVGPRSpillingEnabled(*MF.getFunction())) {
      OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
-      OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4);
+      OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
    }
  }

  if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_PS) {
    OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
-    OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4);
+    OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks), 4);
    OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
    OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4);
    OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
@ -704,7 +787,7 @@ static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
 }

 void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
-                                        const SIProgramInfo &KernelInfo,
+                                        const SIProgramInfo &CurrentProgramInfo,
                                        const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
@ -712,10 +795,13 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
  AMDGPU::initDefaultAMDKernelCodeT(Out, STM.getFeatureBits());

  Out.compute_pgm_resource_registers =
-      KernelInfo.ComputePGMRSrc1 |
-      (KernelInfo.ComputePGMRSrc2 << 32);
+      CurrentProgramInfo.ComputePGMRSrc1 |
+      (CurrentProgramInfo.ComputePGMRSrc2 << 32);
  Out.code_properties = AMD_CODE_PROPERTY_IS_PTR64;

+  if (CurrentProgramInfo.DynamicCallStack)
+    Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK;
+
  AMD_HSA_BITS_SET(Out.code_properties,
                   AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
                   getElementByteSizeValue(STM.getMaxPrivateElementSize()));
@ -767,12 +853,12 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
  // FIXME: Should use getKernArgSize
  Out.kernarg_segment_byte_size =
    STM.getKernArgSegmentSize(MF, MFI->getABIArgOffset());
-  Out.wavefront_sgpr_count = KernelInfo.NumSGPR;
-  Out.workitem_vgpr_count = KernelInfo.NumVGPR;
-  Out.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
-  Out.workgroup_group_segment_byte_size = KernelInfo.LDSSize;
-  Out.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst;
-  Out.reserved_vgpr_count = KernelInfo.ReservedVGPRCount;
+  Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
+  Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
+  Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
+  Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
+  Out.reserved_vgpr_first = CurrentProgramInfo.ReservedVGPRFirst;
+  Out.reserved_vgpr_count = CurrentProgramInfo.ReservedVGPRCount;

  // These alignment values are specified in powers of two, so alignment =
  // 2^n.  The minimum alignment is 2^4 = 16.
@ -781,9 +867,9 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,

  if (STM.debuggerEmitPrologue()) {
    Out.debug_wavefront_private_segment_offset_sgpr =
-      KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
+      CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
    Out.debug_private_segment_buffer_sgpr =
-      KernelInfo.DebuggerPrivateSegmentBufferSGPR;
+      CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR;
  }
 }

--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@ -30,9 +30,26 @@ namespace llvm {

 class AMDGPUTargetStreamer;
 class MCOperand;
+class SISubtarget;

 class AMDGPUAsmPrinter final : public AsmPrinter {
 private:
+  // Track resource usage for callee functions.
+  struct SIFunctionResourceInfo {
+    // Track the number of explicitly used VGPRs. Special registers reserved at
+    // the end are tracked separately.
+    int32_t NumVGPR = 0;
+    int32_t NumExplicitSGPR = 0;
+    uint32_t PrivateSegmentSize = 0;
+    bool UsesVCC = false;
+    bool UsesFlatScratch = false;
+    bool HasDynamicallySizedStack = false;
+    bool HasRecursion = false;
+
+    int32_t getTotalNumSGPRs(const SISubtarget &ST) const;
+  };
+
+  // Track resource usage for kernels / entry functions.
  struct SIProgramInfo {
    // Fields set in PGM_RSRC1 pm4 packet.
    uint32_t VGPRBlocks = 0;
@ -83,14 +100,23 @@ private:
    uint16_t DebuggerPrivateSegmentBufferSGPR =
        std::numeric_limits<uint16_t>::max();

+    // Whether there is recursion, dynamic allocas, indirect calls or some other
+    // reason there may be statically unknown stack usage.
+    bool DynamicCallStack = false;
+
    // Bonus information for debugging.
    bool VCCUsed = false;

    SIProgramInfo() = default;
  };

+  SIProgramInfo CurrentProgramInfo;
+  DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo;
+
  uint64_t getFunctionCodeSize(const MachineFunction &MF) const;
-  void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF) const;
+  SIFunctionResourceInfo analyzeResourceUsage(const MachineFunction &MF) const;
+
+  void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF);
  void getAmdKernelCode(amd_kernel_code_t &Out, const SIProgramInfo &KernelInfo,
                        const MachineFunction &MF) const;
  void findNumUsedRegistersSI(const MachineFunction &MF,
@ -101,6 +127,10 @@ private:
  /// can correctly setup the GPU state.
  void EmitProgramInfoR600(const MachineFunction &MF);
  void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo);
+  void emitCommonFunctionComments(uint32_t NumVGPR,
+                                  uint32_t NumSGPR,
+                                  uint32_t ScratchSize,
+                                  uint64_t CodeSize);

 public:
  explicit AMDGPUAsmPrinter(TargetMachine &TM,
@ -112,6 +142,7 @@ public:

  AMDGPUTargetStreamer& getTargetStreamer() const;

+  bool doFinalization(Module &M) override;
  bool runOnMachineFunction(MachineFunction &MF) override;

  /// \brief Wrapper for MCInstLowering.lowerOperand() for the tblgen'erated