From 626a31de15212a0e0c25df8435753cb9a0684668 Mon Sep 17 00:00:00 2001 From: Jon Chesterfield Date: Thu, 18 Mar 2021 17:00:41 +0000 Subject: [PATCH] [libomptarget] Add register usage info to kernel metadata Add register usage information to the runtime metadata so that it can be used during kernel launch (that change will be in a different commit). Add this information to the kernel trace. Reviewed By: JonChesterfield Differential Revision: https://reviews.llvm.org/D98829 --- .../plugins/amdgpu/impl/internal.h | 4 +++ .../plugins/amdgpu/impl/system.cpp | 26 ++++++++++++++++++- .../libomptarget/plugins/amdgpu/src/rtl.cpp | 20 +++++++++++--- 3 files changed, 46 insertions(+), 4 deletions(-) diff --git a/openmp/libomptarget/plugins/amdgpu/impl/internal.h b/openmp/libomptarget/plugins/amdgpu/impl/internal.h index 1b1d69328785..8ca66a9d478e 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/internal.h +++ b/openmp/libomptarget/plugins/amdgpu/impl/internal.h @@ -97,6 +97,10 @@ typedef struct atl_kernel_info_s { uint64_t kernel_object; uint32_t group_segment_size; uint32_t private_segment_size; + uint32_t sgpr_count; + uint32_t vgpr_count; + uint32_t sgpr_spill_count; + uint32_t vgpr_spill_count; uint32_t kernel_segment_size; uint32_t num_args; std::vector arg_alignments; diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp index da152b4045d1..d6cde1f699c2 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp @@ -832,7 +832,31 @@ static hsa_status_t get_code_object_custom_metadata(void *binary, msgpack_errors += map_lookup_string(element, ".symbol", &symbolName); msgpackErrorCheck(strings lookup in kernel metadata, msgpack_errors); - atl_kernel_info_t info = {0, 0, 0, 0, 0, {}, {}, {}}; + atl_kernel_info_t info = {0, 0, 0, 0, 0, 0, 0, 0, 0, {}, {}, {}}; + + uint64_t sgpr_count, vgpr_count, sgpr_spill_count, vgpr_spill_count; + msgpack_errors += map_lookup_uint64_t(element, ".sgpr_count", &sgpr_count); + msgpackErrorCheck(sgpr count metadata lookup in kernel metadata, + msgpack_errors); + info.sgpr_count = sgpr_count; + + msgpack_errors += map_lookup_uint64_t(element, ".vgpr_count", &vgpr_count); + msgpackErrorCheck(vgpr count metadata lookup in kernel metadata, + msgpack_errors); + info.vgpr_count = vgpr_count; + + msgpack_errors += + map_lookup_uint64_t(element, ".sgpr_spill_count", &sgpr_spill_count); + msgpackErrorCheck(sgpr spill count metadata lookup in kernel metadata, + msgpack_errors); + info.sgpr_spill_count = sgpr_spill_count; + + msgpack_errors += + map_lookup_uint64_t(element, ".vgpr_spill_count", &vgpr_spill_count); + msgpackErrorCheck(vgpr spill count metadata lookup in kernel metadata, + msgpack_errors); + info.vgpr_spill_count = vgpr_spill_count; + size_t kernel_explicit_args_size = 0; uint64_t kernel_segment_size; msgpack_errors += map_lookup_uint64_t(element, ".kernarg_segment_size", diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp index 0e8df9e9ca60..a6b426dc0557 100644 --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -1759,6 +1759,19 @@ int32_t __tgt_rtl_run_target_team_region_locked( KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr; + std::string kernel_name = std::string(KernelInfo->Name); + uint32_t sgpr_count, vgpr_count, sgpr_spill_count, vgpr_spill_count; + + { + assert(KernelInfoTable[device_id].find(kernel_name) != + KernelInfoTable[device_id].end()); + auto it = KernelInfoTable[device_id][kernel_name]; + sgpr_count = it.sgpr_count; + vgpr_count = it.vgpr_count; + sgpr_spill_count = it.sgpr_spill_count; + vgpr_spill_count = it.vgpr_spill_count; + } + /* * Set limit based on ThreadsPerGroup and GroupsPerDevice */ @@ -1780,10 +1793,12 @@ int32_t __tgt_rtl_run_target_team_region_locked( bool traceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING); fprintf(traceToStdout ? stdout : stderr, "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) " - "reqd:(%4dX%4d) n:%s\n", + "reqd:(%4dX%4d) sgpr_count:%u vgpr_count:%u sgpr_spill_count:%u " + "vgpr_spill_count:%u tripcount:%lu n:%s\n", device_id, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize, arg_num, num_groups, threadsPerGroup, num_teams, thread_limit, - KernelInfo->Name); + sgpr_count, vgpr_count, sgpr_spill_count, vgpr_spill_count, + loop_tripcount, KernelInfo->Name); } // Run on the device. @@ -1812,7 +1827,6 @@ int32_t __tgt_rtl_run_target_team_region_locked( packet->reserved2 = 0; // atmi writes id_ here packet->completion_signal = {0}; // may want a pool of signals - std::string kernel_name = std::string(KernelInfo->Name); { assert(KernelInfoTable[device_id].find(kernel_name) != KernelInfoTable[device_id].end());