forked from OSchip/llvm-project
Revert "[AMDGPU] Support disassembly for AMDGPU kernel descriptors"
This reverts commit 487a805310.
Tests fail on big endian machines.
This commit is contained in:
parent
d816499f95
commit
f078577f31
|
@ -162,49 +162,39 @@ struct kernel_descriptor_t {
|
|||
uint8_t reserved2[6];
|
||||
};
|
||||
|
||||
// Byte offset of every kernel_descriptor_t field inside the 64-byte
// descriptor. Each offset is written as the previous field's offset plus that
// field's width in bytes, so the layout can be read straight off the list.
enum : uint32_t {
  GROUP_SEGMENT_FIXED_SIZE_OFFSET = 0,
  PRIVATE_SEGMENT_FIXED_SIZE_OFFSET = GROUP_SEGMENT_FIXED_SIZE_OFFSET + 4,
  RESERVED0_OFFSET = PRIVATE_SEGMENT_FIXED_SIZE_OFFSET + 4,
  KERNEL_CODE_ENTRY_BYTE_OFFSET_OFFSET = RESERVED0_OFFSET + 8,
  RESERVED1_OFFSET = KERNEL_CODE_ENTRY_BYTE_OFFSET_OFFSET + 8,
  COMPUTE_PGM_RSRC3_OFFSET = RESERVED1_OFFSET + 20,
  COMPUTE_PGM_RSRC1_OFFSET = COMPUTE_PGM_RSRC3_OFFSET + 4,
  COMPUTE_PGM_RSRC2_OFFSET = COMPUTE_PGM_RSRC1_OFFSET + 4,
  KERNEL_CODE_PROPERTIES_OFFSET = COMPUTE_PGM_RSRC2_OFFSET + 4,
  RESERVED2_OFFSET = KERNEL_CODE_PROPERTIES_OFFSET + 2,
};
|
||||
|
||||
static_assert(
|
||||
sizeof(kernel_descriptor_t) == 64,
|
||||
"invalid size for kernel_descriptor_t");
|
||||
static_assert(offsetof(kernel_descriptor_t, group_segment_fixed_size) ==
|
||||
GROUP_SEGMENT_FIXED_SIZE_OFFSET,
|
||||
"invalid offset for group_segment_fixed_size");
|
||||
static_assert(offsetof(kernel_descriptor_t, private_segment_fixed_size) ==
|
||||
PRIVATE_SEGMENT_FIXED_SIZE_OFFSET,
|
||||
"invalid offset for private_segment_fixed_size");
|
||||
static_assert(offsetof(kernel_descriptor_t, reserved0) == RESERVED0_OFFSET,
|
||||
"invalid offset for reserved0");
|
||||
static_assert(offsetof(kernel_descriptor_t, kernel_code_entry_byte_offset) ==
|
||||
KERNEL_CODE_ENTRY_BYTE_OFFSET_OFFSET,
|
||||
"invalid offset for kernel_code_entry_byte_offset");
|
||||
static_assert(offsetof(kernel_descriptor_t, reserved1) == RESERVED1_OFFSET,
|
||||
"invalid offset for reserved1");
|
||||
static_assert(offsetof(kernel_descriptor_t, compute_pgm_rsrc3) ==
|
||||
COMPUTE_PGM_RSRC3_OFFSET,
|
||||
"invalid offset for compute_pgm_rsrc3");
|
||||
static_assert(offsetof(kernel_descriptor_t, compute_pgm_rsrc1) ==
|
||||
COMPUTE_PGM_RSRC1_OFFSET,
|
||||
"invalid offset for compute_pgm_rsrc1");
|
||||
static_assert(offsetof(kernel_descriptor_t, compute_pgm_rsrc2) ==
|
||||
COMPUTE_PGM_RSRC2_OFFSET,
|
||||
"invalid offset for compute_pgm_rsrc2");
|
||||
static_assert(offsetof(kernel_descriptor_t, kernel_code_properties) ==
|
||||
KERNEL_CODE_PROPERTIES_OFFSET,
|
||||
"invalid offset for kernel_code_properties");
|
||||
static_assert(offsetof(kernel_descriptor_t, reserved2) == RESERVED2_OFFSET,
|
||||
"invalid offset for reserved2");
|
||||
// Compile-time layout checks: every kernel_descriptor_t field must sit at its
// fixed byte offset within the descriptor.
static_assert(offsetof(kernel_descriptor_t, group_segment_fixed_size) == 0,
              "invalid offset for group_segment_fixed_size");
static_assert(offsetof(kernel_descriptor_t, private_segment_fixed_size) == 4,
              "invalid offset for private_segment_fixed_size");
static_assert(offsetof(kernel_descriptor_t, reserved0) == 8,
              "invalid offset for reserved0");
static_assert(offsetof(kernel_descriptor_t, kernel_code_entry_byte_offset) ==
                  16,
              "invalid offset for kernel_code_entry_byte_offset");
static_assert(offsetof(kernel_descriptor_t, reserved1) == 24,
              "invalid offset for reserved1");
static_assert(offsetof(kernel_descriptor_t, compute_pgm_rsrc3) == 44,
              "invalid offset for compute_pgm_rsrc3");
static_assert(offsetof(kernel_descriptor_t, compute_pgm_rsrc1) == 48,
              "invalid offset for compute_pgm_rsrc1");
static_assert(offsetof(kernel_descriptor_t, compute_pgm_rsrc2) == 52,
              "invalid offset for compute_pgm_rsrc2");
static_assert(offsetof(kernel_descriptor_t, kernel_code_properties) == 56,
              "invalid offset for kernel_code_properties");
static_assert(offsetof(kernel_descriptor_t, reserved2) == 58,
              "invalid offset for reserved2");
|
||||
|
||||
} // end namespace amdhsa
|
||||
} // end namespace llvm
|
||||
|
|
|
@ -34,7 +34,6 @@
|
|||
#include "llvm/MC/MCFixedLenDisassembler.h"
|
||||
#include "llvm/MC/MCInst.h"
|
||||
#include "llvm/MC/MCSubtargetInfo.h"
|
||||
#include "llvm/Support/AMDHSAKernelDescriptor.h"
|
||||
#include "llvm/Support/Endian.h"
|
||||
#include "llvm/Support/ErrorHandling.h"
|
||||
#include "llvm/Support/MathExtras.h"
|
||||
|
@ -1216,350 +1215,6 @@ bool AMDGPUDisassembler::isGFX10() const {
|
|||
return STI.getFeatureBits()[AMDGPU::FeatureGFX10];
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// AMDGPU specific symbol handling
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Prints one "<Indent><DIRECTIVE> <value>\n" line to KdStream, where <value>
// is the bit-field selected by MASK extracted from FourByteBuffer and shifted
// down by MASK##_SHIFT. The expansion site must have KdStream, Indent and
// FourByteBuffer in scope; DIRECTIVE must be a string literal.
#define PRINT_DIRECTIVE(DIRECTIVE, MASK)                                       \
  do {                                                                         \
    KdStream << Indent << DIRECTIVE " "                                        \
             << ((FourByteBuffer & MASK) >> (MASK##_SHIFT)) << '\n';           \
  } while (0)
|
||||
|
||||
// NOLINTNEXTLINE(readability-identifier-naming)
|
||||
MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1(
|
||||
uint32_t FourByteBuffer, raw_string_ostream &KdStream) const {
|
||||
using namespace amdhsa;
|
||||
StringRef Indent = "\t";
|
||||
|
||||
// We cannot accurately backward compute #VGPRs used from
|
||||
// GRANULATED_WORKITEM_VGPR_COUNT. But we are concerned with getting the same
|
||||
// value of GRANULATED_WORKITEM_VGPR_COUNT in the reassembled binary. So we
|
||||
// simply calculate the inverse of what the assembler does.
|
||||
|
||||
uint32_t GranulatedWorkitemVGPRCount =
|
||||
(FourByteBuffer & COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT) >>
|
||||
COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_SHIFT;
|
||||
|
||||
uint32_t NextFreeVGPR = (GranulatedWorkitemVGPRCount + 1) *
|
||||
AMDGPU::IsaInfo::getVGPREncodingGranule(&STI);
|
||||
|
||||
KdStream << Indent << ".amdhsa_next_free_vgpr " << NextFreeVGPR << '\n';
|
||||
|
||||
// We cannot backward compute values used to calculate
|
||||
// GRANULATED_WAVEFRONT_SGPR_COUNT. Hence the original values for following
|
||||
// directives can't be computed:
|
||||
// .amdhsa_reserve_vcc
|
||||
// .amdhsa_reserve_flat_scratch
|
||||
// .amdhsa_reserve_xnack_mask
|
||||
// They take their respective default values if not specified in the assembly.
|
||||
//
|
||||
// GRANULATED_WAVEFRONT_SGPR_COUNT
|
||||
// = f(NEXT_FREE_SGPR + VCC + FLAT_SCRATCH + XNACK_MASK)
|
||||
//
|
||||
// We compute the inverse as though all directives apart from NEXT_FREE_SGPR
|
||||
// are set to 0. So while disassembling we consider that:
|
||||
//
|
||||
// GRANULATED_WAVEFRONT_SGPR_COUNT
|
||||
// = f(NEXT_FREE_SGPR + 0 + 0 + 0)
|
||||
//
|
||||
// The disassembler cannot recover the original values of those 3 directives.
|
||||
|
||||
uint32_t GranulatedWavefrontSGPRCount =
|
||||
(FourByteBuffer & COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT) >>
|
||||
COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_SHIFT;
|
||||
|
||||
if (isGFX10() && GranulatedWavefrontSGPRCount)
|
||||
return MCDisassembler::Fail;
|
||||
|
||||
uint32_t NextFreeSGPR = (GranulatedWavefrontSGPRCount + 1) *
|
||||
AMDGPU::IsaInfo::getSGPREncodingGranule(&STI);
|
||||
|
||||
KdStream << Indent << ".amdhsa_reserve_vcc " << 0 << '\n';
|
||||
KdStream << Indent << ".amdhsa_reserve_flat_scratch " << 0 << '\n';
|
||||
KdStream << Indent << ".amdhsa_reserve_xnack_mask " << 0 << '\n';
|
||||
KdStream << Indent << ".amdhsa_next_free_sgpr " << NextFreeSGPR << "\n";
|
||||
|
||||
if (FourByteBuffer & COMPUTE_PGM_RSRC1_PRIORITY)
|
||||
return MCDisassembler::Fail;
|
||||
|
||||
PRINT_DIRECTIVE(".amdhsa_float_round_mode_32",
|
||||
COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32);
|
||||
PRINT_DIRECTIVE(".amdhsa_float_round_mode_16_64",
|
||||
COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64);
|
||||
PRINT_DIRECTIVE(".amdhsa_float_denorm_mode_32",
|
||||
COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32);
|
||||
PRINT_DIRECTIVE(".amdhsa_float_denorm_mode_16_64",
|
||||
COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64);
|
||||
|
||||
if (FourByteBuffer & COMPUTE_PGM_RSRC1_PRIV)
|
||||
return MCDisassembler::Fail;
|
||||
|
||||
PRINT_DIRECTIVE(".amdhsa_dx10_clamp", COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP);
|
||||
|
||||
if (FourByteBuffer & COMPUTE_PGM_RSRC1_DEBUG_MODE)
|
||||
return MCDisassembler::Fail;
|
||||
|
||||
PRINT_DIRECTIVE(".amdhsa_ieee_mode", COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE);
|
||||
|
||||
if (FourByteBuffer & COMPUTE_PGM_RSRC1_BULKY)
|
||||
return MCDisassembler::Fail;
|
||||
|
||||
if (FourByteBuffer & COMPUTE_PGM_RSRC1_CDBG_USER)
|
||||
return MCDisassembler::Fail;
|
||||
|
||||
PRINT_DIRECTIVE(".amdhsa_fp16_overflow", COMPUTE_PGM_RSRC1_FP16_OVFL);
|
||||
|
||||
if (FourByteBuffer & COMPUTE_PGM_RSRC1_RESERVED0)
|
||||
return MCDisassembler::Fail;
|
||||
|
||||
if (isGFX10()) {
|
||||
PRINT_DIRECTIVE(".amdhsa_workgroup_processor_mode",
|
||||
COMPUTE_PGM_RSRC1_WGP_MODE);
|
||||
PRINT_DIRECTIVE(".amdhsa_memory_ordered", COMPUTE_PGM_RSRC1_MEM_ORDERED);
|
||||
PRINT_DIRECTIVE(".amdhsa_forward_progress", COMPUTE_PGM_RSRC1_FWD_PROGRESS);
|
||||
}
|
||||
return MCDisassembler::Success;
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(readability-identifier-naming)
|
||||
MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC2(
|
||||
uint32_t FourByteBuffer, raw_string_ostream &KdStream) const {
|
||||
using namespace amdhsa;
|
||||
StringRef Indent = "\t";
|
||||
PRINT_DIRECTIVE(
|
||||
".amdhsa_system_sgpr_private_segment_wavefront_offset",
|
||||
COMPUTE_PGM_RSRC2_ENABLE_SGPR_PRIVATE_SEGMENT_WAVEFRONT_OFFSET);
|
||||
PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_x",
|
||||
COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X);
|
||||
PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_y",
|
||||
COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y);
|
||||
PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_z",
|
||||
COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z);
|
||||
PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_info",
|
||||
COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO);
|
||||
PRINT_DIRECTIVE(".amdhsa_system_vgpr_workitem_id",
|
||||
COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID);
|
||||
|
||||
if (FourByteBuffer & COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_ADDRESS_WATCH)
|
||||
return MCDisassembler::Fail;
|
||||
|
||||
if (FourByteBuffer & COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_MEMORY)
|
||||
return MCDisassembler::Fail;
|
||||
|
||||
if (FourByteBuffer & COMPUTE_PGM_RSRC2_GRANULATED_LDS_SIZE)
|
||||
return MCDisassembler::Fail;
|
||||
|
||||
PRINT_DIRECTIVE(
|
||||
".amdhsa_exception_fp_ieee_invalid_op",
|
||||
COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION);
|
||||
PRINT_DIRECTIVE(".amdhsa_exception_fp_denorm_src",
|
||||
COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE);
|
||||
PRINT_DIRECTIVE(
|
||||
".amdhsa_exception_fp_ieee_div_zero",
|
||||
COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO);
|
||||
PRINT_DIRECTIVE(".amdhsa_exception_fp_ieee_overflow",
|
||||
COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW);
|
||||
PRINT_DIRECTIVE(".amdhsa_exception_fp_ieee_underflow",
|
||||
COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW);
|
||||
PRINT_DIRECTIVE(".amdhsa_exception_fp_ieee_inexact",
|
||||
COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT);
|
||||
PRINT_DIRECTIVE(".amdhsa_exception_int_div_zero",
|
||||
COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO);
|
||||
|
||||
if (FourByteBuffer & COMPUTE_PGM_RSRC2_RESERVED0)
|
||||
return MCDisassembler::Fail;
|
||||
|
||||
return MCDisassembler::Success;
|
||||
}
|
||||
|
||||
#undef PRINT_DIRECTIVE
|
||||
|
||||
/// Decode the kernel descriptor field that starts at the cursor position and
/// print the directives (if any) that correspond to it.
///
/// \param Cursor - Position inside \p Bytes. It must be one of the
/// amdhsa::*_OFFSET values; it is advanced past the decoded field.
/// \param Bytes - The 64 bytes of the kernel descriptor.
/// \param KdStream - Stream the directives are appended to.
/// \returns Fail if a reserved region holds a non-zero byte (or could not be
/// read in full), if COMPUTE_PGM_RSRC3 is non-zero on a non-GFX10 target, or
/// if one of the per-field checks for COMPUTE_PGM_RSRC1, COMPUTE_PGM_RSRC2 or
/// the kernel code properties fails; Success otherwise.
MCDisassembler::DecodeStatus
AMDGPUDisassembler::decodeKernelDescriptorDirective(
    DataExtractor::Cursor &Cursor, ArrayRef<uint8_t> Bytes,
    raw_string_ostream &KdStream) const {
#define PRINT_DIRECTIVE(DIRECTIVE, MASK)                                       \
  do {                                                                         \
    KdStream << Indent << DIRECTIVE " "                                        \
             << ((TwoByteBuffer & MASK) >> (MASK##_SHIFT)) << '\n';            \
  } while (0)

  uint16_t TwoByteBuffer = 0;
  uint32_t FourByteBuffer = 0;
  StringRef Indent = "\t";

  assert(Bytes.size() == 64);
  DataExtractor DE(Bytes, /*IsLittleEndian=*/true, /*AddressSize=*/8);

  // Consumes Count bytes at the cursor and reports whether all of them were
  // read and are zero. The length check matters: when the read fails,
  // getBytes hands back fewer bytes than requested, and indexing a fixed
  // count into that result would run past its end.
  const auto ReservedBytesAreZero = [&DE, &Cursor](uint64_t Count) {
    const StringRef Reserved = DE.getBytes(Cursor, Count);
    if (Reserved.size() != Count)
      return false;
    for (const char Byte : Reserved)
      if (Byte != 0)
        return false;
    return true;
  };

  switch (Cursor.tell()) {
  case amdhsa::GROUP_SEGMENT_FIXED_SIZE_OFFSET:
    FourByteBuffer = DE.getU32(Cursor);
    KdStream << Indent << ".amdhsa_group_segment_fixed_size " << FourByteBuffer
             << '\n';
    return MCDisassembler::Success;

  case amdhsa::PRIVATE_SEGMENT_FIXED_SIZE_OFFSET:
    FourByteBuffer = DE.getU32(Cursor);
    KdStream << Indent << ".amdhsa_private_segment_fixed_size "
             << FourByteBuffer << '\n';
    return MCDisassembler::Success;

  case amdhsa::RESERVED0_OFFSET:
    // 8 reserved bytes, must be 0.
    if (DE.getU64(Cursor) != 0)
      return MCDisassembler::Fail;
    return MCDisassembler::Success;

  case amdhsa::KERNEL_CODE_ENTRY_BYTE_OFFSET_OFFSET:
    // KERNEL_CODE_ENTRY_BYTE_OFFSET
    // So far no directive controls this for Code Object V3, so simply skip for
    // disassembly.
    DE.skip(Cursor, 8);
    return MCDisassembler::Success;

  case amdhsa::RESERVED1_OFFSET:
    // 20 reserved bytes, must be 0.
    if (!ReservedBytesAreZero(20))
      return MCDisassembler::Fail;
    return MCDisassembler::Success;

  case amdhsa::COMPUTE_PGM_RSRC3_OFFSET:
    // COMPUTE_PGM_RSRC3
    //  - Only set for GFX10, GFX6-9 have this to be 0.
    //  - Currently no directives directly control this.
    FourByteBuffer = DE.getU32(Cursor);
    if (!isGFX10() && FourByteBuffer)
      return MCDisassembler::Fail;
    return MCDisassembler::Success;

  case amdhsa::COMPUTE_PGM_RSRC1_OFFSET:
    FourByteBuffer = DE.getU32(Cursor);
    if (decodeCOMPUTE_PGM_RSRC1(FourByteBuffer, KdStream) ==
        MCDisassembler::Fail)
      return MCDisassembler::Fail;
    return MCDisassembler::Success;

  case amdhsa::COMPUTE_PGM_RSRC2_OFFSET:
    FourByteBuffer = DE.getU32(Cursor);
    if (decodeCOMPUTE_PGM_RSRC2(FourByteBuffer, KdStream) ==
        MCDisassembler::Fail)
      return MCDisassembler::Fail;
    return MCDisassembler::Success;

  case amdhsa::KERNEL_CODE_PROPERTIES_OFFSET: {
    using namespace amdhsa;
    TwoByteBuffer = DE.getU16(Cursor);

    PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_buffer",
                    KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
    PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_ptr",
                    KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR);
    PRINT_DIRECTIVE(".amdhsa_user_sgpr_queue_ptr",
                    KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR);
    PRINT_DIRECTIVE(".amdhsa_user_sgpr_kernarg_segment_ptr",
                    KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR);
    PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_id",
                    KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID);
    PRINT_DIRECTIVE(".amdhsa_user_sgpr_flat_scratch_init",
                    KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
    PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size",
                    KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);

    if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED0)
      return MCDisassembler::Fail;

    // Reserved for GFX9
    if (isGFX9() &&
        (TwoByteBuffer & KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32))
      return MCDisassembler::Fail;
    if (isGFX10()) {
      PRINT_DIRECTIVE(".amdhsa_wavefront_size32",
                      KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32);
    }

    if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED1)
      return MCDisassembler::Fail;

    return MCDisassembler::Success;
  }

  case amdhsa::RESERVED2_OFFSET:
    // 6 bytes from here are reserved, must be 0.
    if (!ReservedBytesAreZero(6))
      return MCDisassembler::Fail;
    return MCDisassembler::Success;

  default:
    llvm_unreachable("Unhandled index. Case statements cover everything.");
    return MCDisassembler::Fail;
  }
#undef PRINT_DIRECTIVE
}
|
||||
|
||||
/// Disassemble a whole kernel descriptor into an .amdhsa_kernel block.
///
/// \param KdName - Kernel name printed after ".amdhsa_kernel".
/// \param Bytes - Contents of the descriptor; must be exactly 64 bytes.
/// \param KdAddress - Address of the descriptor; must be a multiple of 64.
/// \returns Fail if the size or alignment requirement is not met or if any
/// field fails to decode (nothing is printed in that case). On Success the
/// complete block has been written to outs().
MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeKernelDescriptor(
    StringRef KdName, ArrayRef<uint8_t> Bytes, uint64_t KdAddress) const {
  // CP microcode requires the kernel descriptor to be 64 aligned.
  const bool HasDescriptorSize = Bytes.size() == 64;
  const bool IsAligned = KdAddress % 64 == 0;
  if (!HasDescriptorSize || !IsAligned)
    return MCDisassembler::Fail;

  // The text is buffered so that a descriptor that fails half-way through
  // produces no output at all.
  std::string Text;
  raw_string_ostream KdStream(Text);
  KdStream << ".amdhsa_kernel " << KdName << '\n';

  // Each call decodes the field at the cursor and advances past it.
  DataExtractor::Cursor FieldCursor(0);
  while (FieldCursor && FieldCursor.tell() < Bytes.size()) {
    const MCDisassembler::DecodeStatus FieldStatus =
        decodeKernelDescriptorDirective(FieldCursor, Bytes, KdStream);

    cantFail(FieldCursor.takeError());

    if (FieldStatus == MCDisassembler::Fail)
      return MCDisassembler::Fail;
  }

  KdStream << ".end_amdhsa_kernel\n";
  outs() << KdStream.str();
  return MCDisassembler::Success;
}
|
||||
|
||||
/// Target hook run at the start of a symbol.
///
/// \param Symbol - The symbol being visited; its Type and Name are inspected.
/// \param Size - Set to 256 for STT_AMDGPU_HSA_KERNEL symbols and to 64 for
/// ".kd" objects; left untouched otherwise.
/// \param Bytes - Bytes available at the symbol.
/// \param Address - Address of the symbol.
/// \param CStream - Not used by this implementation.
/// \returns Fail for STT_AMDGPU_HSA_KERNEL symbols, the result of
/// decodeKernelDescriptor for STT_OBJECT symbols whose name ends in ".kd",
/// and None for every other symbol.
Optional<MCDisassembler::DecodeStatus>
AMDGPUDisassembler::onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size,
                                  ArrayRef<uint8_t> Bytes, uint64_t Address,
                                  raw_ostream &CStream) const {
  // Right now only kernel descriptor needs to be handled.
  // We ignore all other symbols for target specific handling.
  // TODO:
  // Fix the spurious symbol issue for AMDGPU kernels. Exists for both Code
  // Object V2 and V3 when symbols are marked protected.

  // amd_kernel_code_t for Code Object V2.
  if (Symbol.Type == ELF::STT_AMDGPU_HSA_KERNEL) {
    Size = 256;
    return MCDisassembler::Fail;
  }

  // Code Object V3 kernel descriptors.
  const StringRef KdSuffix(".kd");
  const StringRef Name = Symbol.Name;
  if (Symbol.Type != ELF::STT_OBJECT || !Name.endswith(KdSuffix))
    return None;

  Size = 64; // Size = 64 regardless of success or failure.
  return decodeKernelDescriptor(Name.drop_back(KdSuffix.size()), Bytes,
                                Address);
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// AMDGPUSymbolizer
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
|
|
@ -17,11 +17,10 @@
|
|||
|
||||
#include "llvm/ADT/ArrayRef.h"
|
||||
#include "llvm/MC/MCContext.h"
|
||||
#include "llvm/MC/MCInstrInfo.h"
|
||||
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
|
||||
#include "llvm/MC/MCDisassembler/MCRelocationInfo.h"
|
||||
#include "llvm/MC/MCDisassembler/MCSymbolizer.h"
|
||||
#include "llvm/MC/MCInstrInfo.h"
|
||||
#include "llvm/Support/DataExtractor.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
|
@ -67,33 +66,6 @@ public:
|
|||
DecodeStatus tryDecodeInst(const uint8_t* Table, MCInst &MI, uint64_t Inst,
|
||||
uint64_t Address) const;
|
||||
|
||||
Optional<DecodeStatus> onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size,
|
||||
ArrayRef<uint8_t> Bytes,
|
||||
uint64_t Address,
|
||||
raw_ostream &CStream) const override;
|
||||
|
||||
DecodeStatus decodeKernelDescriptor(StringRef KdName, ArrayRef<uint8_t> Bytes,
|
||||
uint64_t KdAddress) const;
|
||||
|
||||
DecodeStatus
|
||||
decodeKernelDescriptorDirective(DataExtractor::Cursor &Cursor,
|
||||
ArrayRef<uint8_t> Bytes,
|
||||
raw_string_ostream &KdStream) const;
|
||||
|
||||
/// Decode as directives that handle COMPUTE_PGM_RSRC1.
|
||||
/// \param FourByteBuffer - Bytes holding contents of COMPUTE_PGM_RSRC1.
|
||||
/// \param KdStream - Stream to write the disassembled directives to.
|
||||
// NOLINTNEXTLINE(readability-identifier-naming)
|
||||
DecodeStatus decodeCOMPUTE_PGM_RSRC1(uint32_t FourByteBuffer,
|
||||
raw_string_ostream &KdStream) const;
|
||||
|
||||
/// Decode as directives that handle COMPUTE_PGM_RSRC2.
|
||||
/// \param FourByteBuffer - Bytes holding contents of COMPUTE_PGM_RSRC2.
|
||||
/// \param KdStream - Stream to write the disassembled directives to.
|
||||
// NOLINTNEXTLINE(readability-identifier-naming)
|
||||
DecodeStatus decodeCOMPUTE_PGM_RSRC2(uint32_t FourByteBuffer,
|
||||
raw_string_ostream &KdStream) const;
|
||||
|
||||
DecodeStatus convertSDWAInst(MCInst &MI) const;
|
||||
DecodeStatus convertDPP8Inst(MCInst &MI) const;
|
||||
DecodeStatus convertMIMGInst(MCInst &MI) const;
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
; RUN: llc -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=fiji -filetype=obj < %s | llvm-objdump -d - --mcpu=fiji | FileCheck %s
|
||||
|
||||
; CHECK: <kernel0>:
|
||||
; CHECK: s_endpgm
|
||||
; CHECK-NEXT: s_endpgm
|
||||
define amdgpu_kernel void @kernel0() align 256 {
|
||||
entry:
|
||||
ret void
|
||||
|
@ -80,7 +80,7 @@ entry:
|
|||
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: <kernel1>:
|
||||
; CHECK: s_endpgm
|
||||
; CHECK-NEXT: s_endpgm
|
||||
define amdgpu_kernel void @kernel1(i32 addrspace(1)* addrspace(4)* %ptr.out) align 256 {
|
||||
entry:
|
||||
ret void
|
||||
|
|
|
@ -1,37 +0,0 @@
|
|||
;; Failure test. We create a malformed kernel descriptor (KD) by manually
|
||||
;; setting the bytes, because one can't create a malformed KD using the
|
||||
;; assembler directives.
|
||||
|
||||
; RUN: llvm-mc %s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t.o
|
||||
|
||||
; RUN: printf ".type my_kernel.kd, @object \nmy_kernel.kd:\n.size my_kernel.kd, 64\n" > %t1.sym_info
|
||||
; RUN: llvm-objdump --disassemble-symbols=my_kernel.kd %t.o \
|
||||
; RUN: | tail -n +9 > %t1.sym_content
|
||||
; RUN: cat %t1.sym_info %t1.sym_content > %t1.s
|
||||
|
||||
; RUN: llvm-mc %t1.s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t-re-assemble.o
|
||||
; RUN: diff %t.o %t-re-assemble.o
|
||||
|
||||
;; Test failure by setting one of the reserved bytes to non-zero value.
|
||||
|
||||
.type my_kernel.kd, @object
|
||||
.size my_kernel.kd, 64
|
||||
my_kernel.kd:
|
||||
.long 0x00000000 ;; group_segment_fixed_size
|
||||
.long 0x00000000 ;; private_segment_fixed_size
|
||||
.quad 0x00FF000000000000 ;; reserved bytes.
|
||||
.quad 0x0000000000000000 ;; kernel_code_entry_byte_offset, any value works.
|
||||
|
||||
;; 20 reserved bytes.
|
||||
.quad 0x0000000000000000
|
||||
.quad 0x0000000000000000
|
||||
.long 0x00000000
|
||||
|
||||
.long 0x00000000 ;; compute_PGM_RSRC3
|
||||
.long 0x00000000 ;; compute_PGM_RSRC1
|
||||
.long 0x00000000 ;; compute_PGM_RSRC2
|
||||
.short 0x0000 ;; additional fields.
|
||||
|
||||
;; 6 reserved bytes.
|
||||
.long 0x0000000
|
||||
.short 0x0000
|
|
@ -1,49 +0,0 @@
|
|||
;; Test disassembly for GRANULATED_WAVEFRONT_SGPR_COUNT in the kernel descriptor.
|
||||
|
||||
; RUN: split-file %s %t.dir
|
||||
|
||||
; RUN: llvm-mc %t.dir/1.s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t1
|
||||
; RUN: llvm-objdump --disassemble-symbols=my_kernel_1.kd %t1 | tail -n +8 \
|
||||
; RUN: | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t1-re-assemble
|
||||
; RUN: diff %t1 %t1-re-assemble
|
||||
|
||||
; RUN: llvm-mc %t.dir/2.s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t2
|
||||
; RUN: llvm-objdump --disassemble-symbols=my_kernel_2.kd %t2 | tail -n +8 \
|
||||
; RUN: | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t2-re-assemble
|
||||
; RUN: diff %t2 %t2-re-assemble
|
||||
|
||||
; RUN: llvm-mc %t.dir/3.s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t3
|
||||
; RUN: llvm-objdump --disassemble-symbols=my_kernel_3.kd %t3 | tail -n +8 \
|
||||
; RUN: | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t3-re-assemble
|
||||
; RUN: diff %t3 %t3-re-assemble
|
||||
|
||||
|
||||
;--- 1.s
|
||||
;; Only set next_free_sgpr.
|
||||
.amdhsa_kernel my_kernel_1
|
||||
.amdhsa_next_free_vgpr 0
|
||||
.amdhsa_next_free_sgpr 42
|
||||
.amdhsa_reserve_flat_scratch 0
|
||||
.amdhsa_reserve_xnack_mask 0
|
||||
.amdhsa_reserve_vcc 0
|
||||
.end_amdhsa_kernel
|
||||
|
||||
;--- 2.s
|
||||
;; Only set other directives.
|
||||
.amdhsa_kernel my_kernel_2
|
||||
.amdhsa_next_free_vgpr 0
|
||||
.amdhsa_next_free_sgpr 0
|
||||
.amdhsa_reserve_flat_scratch 1
|
||||
.amdhsa_reserve_xnack_mask 1
|
||||
.amdhsa_reserve_vcc 1
|
||||
.end_amdhsa_kernel
|
||||
|
||||
;--- 3.s
|
||||
;; Set all affecting directives.
|
||||
.amdhsa_kernel my_kernel_3
|
||||
.amdhsa_next_free_vgpr 0
|
||||
.amdhsa_next_free_sgpr 35
|
||||
.amdhsa_reserve_flat_scratch 1
|
||||
.amdhsa_reserve_xnack_mask 1
|
||||
.amdhsa_reserve_vcc 1
|
||||
.end_amdhsa_kernel
|
|
@ -1,36 +0,0 @@
|
|||
;; Test disassembly for GRANULATED_WORKITEM_VGPR_COUNT in the kernel descriptor.
|
||||
|
||||
; RUN: split-file %s %t.dir
|
||||
|
||||
; RUN: llvm-mc %t.dir/1.s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t1
|
||||
; RUN: llvm-objdump --disassemble-symbols=my_kernel_1.kd %t1 | tail -n +8 \
|
||||
; RUN: | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t1-re-assemble
|
||||
; RUN: diff %t1 %t1-re-assemble
|
||||
|
||||
; RUN: llvm-mc %t.dir/2.s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t2
|
||||
; RUN: llvm-objdump --disassemble-symbols=my_kernel_2.kd %t2 | tail -n +8 \
|
||||
; RUN: | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t2-re-assemble
|
||||
; RUN: diff %t2 %t2-re-assemble
|
||||
|
||||
; RUN: llvm-mc %t.dir/3.s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t3
|
||||
; RUN: llvm-objdump --disassemble-symbols=my_kernel_3.kd %t3 | tail -n +8 \
|
||||
; RUN: | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t3-re-assemble
|
||||
; RUN: diff %t3 %t3-re-assemble
|
||||
|
||||
;--- 1.s
|
||||
.amdhsa_kernel my_kernel_1
|
||||
.amdhsa_next_free_vgpr 23
|
||||
.amdhsa_next_free_sgpr 0
|
||||
.end_amdhsa_kernel
|
||||
|
||||
;--- 2.s
|
||||
.amdhsa_kernel my_kernel_2
|
||||
.amdhsa_next_free_vgpr 14
|
||||
.amdhsa_next_free_sgpr 0
|
||||
.end_amdhsa_kernel
|
||||
|
||||
;--- 3.s
|
||||
.amdhsa_kernel my_kernel_3
|
||||
.amdhsa_next_free_vgpr 32
|
||||
.amdhsa_next_free_sgpr 0
|
||||
.end_amdhsa_kernel
|
|
@ -1,58 +0,0 @@
|
|||
;; Entirely zeroed kernel descriptor (for GFX10).
|
||||
|
||||
; RUN: llvm-mc %s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx1010 -filetype=obj -o %t
|
||||
; RUN: llvm-objdump -s -j .text %t | FileCheck --check-prefix=OBJDUMP %s
|
||||
|
||||
;; TODO:
|
||||
;; This file and kd-zeroed-raw.s should produce the same output for the kernel
|
||||
;; descriptor - a block of 64 zeroed bytes. But it looks like the assembler sets
|
||||
;; the FWD_PROGRESS bit in COMPUTE_PGM_RSRC1 to 1 even when the directive
|
||||
;; mentions 0 (see line 36).
|
||||
|
||||
;; Check the raw bytes right now.
|
||||
|
||||
; OBJDUMP: 0000 00000000 00000000 00000000 00000000
|
||||
; OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000
|
||||
; OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000
|
||||
; OBJDUMP-NEXT: 0030 01000000 00000000 00000000 00000000
|
||||
|
||||
.amdhsa_kernel my_kernel
|
||||
.amdhsa_group_segment_fixed_size 0
|
||||
.amdhsa_private_segment_fixed_size 0
|
||||
.amdhsa_next_free_vgpr 8
|
||||
.amdhsa_reserve_vcc 0
|
||||
.amdhsa_reserve_flat_scratch 0
|
||||
.amdhsa_reserve_xnack_mask 0
|
||||
.amdhsa_next_free_sgpr 8
|
||||
.amdhsa_float_round_mode_32 0
|
||||
.amdhsa_float_round_mode_16_64 0
|
||||
.amdhsa_float_denorm_mode_32 0
|
||||
.amdhsa_float_denorm_mode_16_64 0
|
||||
.amdhsa_dx10_clamp 0
|
||||
.amdhsa_ieee_mode 0
|
||||
.amdhsa_fp16_overflow 0
|
||||
.amdhsa_workgroup_processor_mode 0
|
||||
.amdhsa_memory_ordered 0
|
||||
.amdhsa_forward_progress 0
|
||||
.amdhsa_system_sgpr_private_segment_wavefront_offset 0
|
||||
.amdhsa_system_sgpr_workgroup_id_x 0
|
||||
.amdhsa_system_sgpr_workgroup_id_y 0
|
||||
.amdhsa_system_sgpr_workgroup_id_z 0
|
||||
.amdhsa_system_sgpr_workgroup_info 0
|
||||
.amdhsa_system_vgpr_workitem_id 0
|
||||
.amdhsa_exception_fp_ieee_invalid_op 0
|
||||
.amdhsa_exception_fp_denorm_src 0
|
||||
.amdhsa_exception_fp_ieee_div_zero 0
|
||||
.amdhsa_exception_fp_ieee_overflow 0
|
||||
.amdhsa_exception_fp_ieee_underflow 0
|
||||
.amdhsa_exception_fp_ieee_inexact 0
|
||||
.amdhsa_exception_int_div_zero 0
|
||||
.amdhsa_user_sgpr_private_segment_buffer 0
|
||||
.amdhsa_user_sgpr_dispatch_ptr 0
|
||||
.amdhsa_user_sgpr_queue_ptr 0
|
||||
.amdhsa_user_sgpr_kernarg_segment_ptr 0
|
||||
.amdhsa_user_sgpr_dispatch_id 0
|
||||
.amdhsa_user_sgpr_flat_scratch_init 0
|
||||
.amdhsa_user_sgpr_private_segment_size 0
|
||||
.amdhsa_wavefront_size32 0
|
||||
.end_amdhsa_kernel
|
|
@ -1,53 +0,0 @@
|
|||
;; Entirely zeroed kernel descriptor (for GFX9).
|
||||
|
||||
; RUN: llvm-mc %s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t1
|
||||
; RUN: llvm-objdump --disassemble-symbols=my_kernel.kd %t1 \
|
||||
; RUN: | tail -n +8 | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t2
|
||||
; RUN: diff %t1 %t2
|
||||
|
||||
; RUN: llvm-objdump -s -j .text %t1 | FileCheck --check-prefix=OBJDUMP %s
|
||||
|
||||
; OBJDUMP: 0000 00000000 00000000 00000000 00000000
|
||||
; OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000
|
||||
; OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000
|
||||
; OBJDUMP-NEXT: 0030 00000000 00000000 00000000 00000000
|
||||
|
||||
;; This file and kd-zeroed-raw.s produce the same output for the kernel
|
||||
;; descriptor - a block of 64 zeroed bytes.
|
||||
|
||||
.amdhsa_kernel my_kernel
|
||||
.amdhsa_group_segment_fixed_size 0
|
||||
.amdhsa_private_segment_fixed_size 0
|
||||
.amdhsa_next_free_vgpr 0
|
||||
.amdhsa_reserve_vcc 0
|
||||
.amdhsa_reserve_flat_scratch 0
|
||||
.amdhsa_reserve_xnack_mask 0
|
||||
.amdhsa_next_free_sgpr 0
|
||||
.amdhsa_float_round_mode_32 0
|
||||
.amdhsa_float_round_mode_16_64 0
|
||||
.amdhsa_float_denorm_mode_32 0
|
||||
.amdhsa_float_denorm_mode_16_64 0
|
||||
.amdhsa_dx10_clamp 0
|
||||
.amdhsa_ieee_mode 0
|
||||
.amdhsa_fp16_overflow 0
|
||||
.amdhsa_system_sgpr_private_segment_wavefront_offset 0
|
||||
.amdhsa_system_sgpr_workgroup_id_x 0
|
||||
.amdhsa_system_sgpr_workgroup_id_y 0
|
||||
.amdhsa_system_sgpr_workgroup_id_z 0
|
||||
.amdhsa_system_sgpr_workgroup_info 0
|
||||
.amdhsa_system_vgpr_workitem_id 0
|
||||
.amdhsa_exception_fp_ieee_invalid_op 0
|
||||
.amdhsa_exception_fp_denorm_src 0
|
||||
.amdhsa_exception_fp_ieee_div_zero 0
|
||||
.amdhsa_exception_fp_ieee_overflow 0
|
||||
.amdhsa_exception_fp_ieee_underflow 0
|
||||
.amdhsa_exception_fp_ieee_inexact 0
|
||||
.amdhsa_exception_int_div_zero 0
|
||||
.amdhsa_user_sgpr_private_segment_buffer 0
|
||||
.amdhsa_user_sgpr_dispatch_ptr 0
|
||||
.amdhsa_user_sgpr_queue_ptr 0
|
||||
.amdhsa_user_sgpr_kernarg_segment_ptr 0
|
||||
.amdhsa_user_sgpr_dispatch_id 0
|
||||
.amdhsa_user_sgpr_flat_scratch_init 0
|
||||
.amdhsa_user_sgpr_private_segment_size 0
|
||||
.end_amdhsa_kernel
|
|
@ -1,41 +0,0 @@
|
|||
; RUN: llvm-mc %s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t1
|
||||
; RUN: llvm-objdump --disassemble-symbols=my_kernel.kd %t1 \
|
||||
; RUN: | tail -n +8 | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t2
|
||||
; RUN: llvm-objdump -s -j .text %t2 | FileCheck --check-prefix=OBJDUMP %s
|
||||
|
||||
;; Not running lit-test over gfx10 (see kd-zeroed-gfx10.s for details).
|
||||
;; kd-zeroed-raw.s and kd-zeroed-*.s should produce the same output for the
|
||||
;; kernel descriptor - a block of 64 zeroed bytes.
|
||||
|
||||
;; The disassembly will produce the contents of kd-zeroed-*.s which on being
|
||||
;; assembled contains additional relocation info. A diff over the entire object
|
||||
;; will fail in this case. So we check by looking at the bytes in .text.
|
||||
|
||||
; OBJDUMP: 0000 00000000 00000000 00000000 00000000
|
||||
; OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000
|
||||
; OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000
|
||||
; OBJDUMP-NEXT: 0030 00000000 00000000 00000000 00000000
|
||||
|
||||
;; The entire object is zeroed out.
|
||||
|
||||
.type my_kernel.kd, @object
|
||||
.size my_kernel.kd, 64
|
||||
my_kernel.kd:
|
||||
.long 0x00000000 ;; group_segment_fixed_size
|
||||
.long 0x00000000 ;; private_segment_fixed_size
|
||||
.quad 0x0000000000000000 ;; reserved bytes.
|
||||
.quad 0x0000000000000000 ;; kernel_code_entry_byte_offset, any value works.
|
||||
|
||||
;; 20 reserved bytes.
|
||||
.quad 0x0000000000000000
|
||||
.quad 0x0000000000000000
|
||||
.long 0x00000000
|
||||
|
||||
.long 0x00000000 ;; compute_PGM_RSRC3
|
||||
.long 0x00000000 ;; compute_PGM_RSRC1
|
||||
.long 0x00000000 ;; compute_PGM_RSRC2
|
||||
.short 0x0000 ;; additional fields.
|
||||
|
||||
;; 6 reserved bytes.
|
||||
.long 0x0000000
|
||||
.short 0x0000
|
|
@ -1854,6 +1854,23 @@ static void disassembleObject(const Target *TheTarget, const ObjectFile *Obj,
|
|||
outs() << SectionName << ":\n";
|
||||
}
|
||||
|
||||
if (Obj->isELF() && Obj->getArch() == Triple::amdgcn) {
|
||||
if (Symbols[SI].Type == ELF::STT_AMDGPU_HSA_KERNEL) {
|
||||
// skip amd_kernel_code_t at the beginning of kernel symbol (256 bytes)
|
||||
Start += 256;
|
||||
}
|
||||
if (SI == SE - 1 ||
|
||||
Symbols[SI + 1].Type == ELF::STT_AMDGPU_HSA_KERNEL) {
|
||||
// cut trailing zeroes at the end of kernel
|
||||
// cut up to 256 bytes
|
||||
const uint64_t EndAlign = 256;
|
||||
const auto Limit = End - (std::min)(EndAlign, End - Start);
|
||||
while (End > Limit &&
|
||||
*reinterpret_cast<const support::ulittle32_t*>(&Bytes[End - 4]) == 0)
|
||||
End -= 4;
|
||||
}
|
||||
}
|
||||
|
||||
outs() << '\n';
|
||||
if (!NoLeadingAddr)
|
||||
outs() << format(Is64Bits ? "%016" PRIx64 " " : "%08" PRIx64 " ",
|
||||
|
|
Loading…
Reference in New Issue