drm/amdkfd: apply trap workaround for gfx11

Due to a HW bug, waves in only half the shader arrays can enter trap.

When starting a debug session, relocate all waves to the first shader
array of each shader engine and mask off the 2nd shader array as
unavailable.

When ending a debug session, re-enable the 2nd shader array per
shader engine.

User CU masking per queue cannot be guaranteed to remain functional
if requested during debugging (e.g. user cu mask requests only 2nd shader
array as an available resource leading to zero HW resources available)
nor can runtime be alerted of any of these changes during execution.

Make user CU masking and debugging mutual exclusive with respect to
availability.

If the debugger tries to attach to a process with a user cu masked
queue, return the runtime status as enabled but busy.

If the debugger tries to attach and fails to reallocate queue waves to
the first shader array of each shader engine, return the runtime status
as enabled but with an error.

In addition, like any other mutli-process debug supported devices,
disable trap temporary setup per-process to avoid performance impact from
setup overhead.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Jonathan Kim 2022-09-01 11:27:15 -04:00 committed by Alex Deucher
parent 218895820e
commit 69a8c3ae2d
14 changed files with 122 additions and 31 deletions

View File

@ -219,6 +219,8 @@ struct mes_add_queue_input {
uint32_t gws_size;
uint64_t tba_addr;
uint64_t tma_addr;
uint32_t trap_en;
uint32_t skip_process_ctx_clear;
uint32_t is_kfd_process;
uint32_t is_aql_queue;
uint32_t queue_size;

View File

@ -202,17 +202,14 @@ static int mes_v11_0_add_hw_queue(struct amdgpu_mes *mes,
mes_add_queue_pkt.gws_size = input->gws_size;
mes_add_queue_pkt.trap_handler_addr = input->tba_addr;
mes_add_queue_pkt.tma_addr = input->tma_addr;
mes_add_queue_pkt.trap_en = input->trap_en;
mes_add_queue_pkt.skip_process_ctx_clear = input->skip_process_ctx_clear;
mes_add_queue_pkt.is_kfd_process = input->is_kfd_process;
/* For KFD, gds_size is re-used for queue size (needed in MES for AQL queues) */
mes_add_queue_pkt.is_aql_queue = input->is_aql_queue;
mes_add_queue_pkt.gds_size = input->queue_size;
if (!(((adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 4) &&
(adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) &&
(adev->ip_versions[GC_HWIP][0] <= IP_VERSION(11, 0, 3))))
mes_add_queue_pkt.trap_en = 1;
/* For KFD, gds_size is re-used for queue size (needed in MES for AQL queues) */
mes_add_queue_pkt.is_aql_queue = input->is_aql_queue;
mes_add_queue_pkt.gds_size = input->queue_size;

View File

@ -537,8 +537,6 @@ static int kfd_ioctl_set_cu_mask(struct file *filp, struct kfd_process *p,
goto out;
}
minfo.update_flag = UPDATE_FLAG_CU_MASK;
mutex_lock(&p->mutex);
retval = pqm_update_mqd(&p->pqm, args->queue_id, &minfo);

View File

@ -24,6 +24,57 @@
#include "kfd_device_queue_manager.h"
#include <linux/file.h>
static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
{
struct mqd_update_info minfo = {0};
int err;
if (!q)
return 0;
if (KFD_GC_VERSION(q->device) < IP_VERSION(11, 0, 0) ||
KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0, 0))
return 0;
if (enable && q->properties.is_user_cu_masked)
return -EBUSY;
minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE;
q->properties.is_dbg_wa = enable;
err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo);
if (err)
q->properties.is_dbg_wa = false;
return err;
}
static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
{
struct process_queue_manager *pqm = &target->pqm;
struct process_queue_node *pqn;
int r = 0;
list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
r = kfd_dbg_set_queue_workaround(pqn->q, enable);
if (enable && r)
goto unwind;
}
return 0;
unwind:
list_for_each_entry(pqn, &pqm->queues, process_queue_list)
kfd_dbg_set_queue_workaround(pqn->q, false);
if (enable)
target->runtime_info.runtime_state = r == -EBUSY ?
DEBUG_RUNTIME_STATE_ENABLED_BUSY :
DEBUG_RUNTIME_STATE_ENABLED_ERROR;
return r;
}
static int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
{
uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
@ -77,6 +128,8 @@ static void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int
else
kfd_dbg_set_mes_debug_mode(pdd);
}
kfd_dbg_set_workaround(target, false);
}
int kfd_dbg_trap_disable(struct kfd_process *target)
@ -111,6 +164,10 @@ static int kfd_dbg_trap_activate(struct kfd_process *target)
{
int i, r = 0;
r = kfd_dbg_set_workaround(target, true);
if (r)
return r;
for (i = 0; i < target->n_pdds; i++) {
struct kfd_process_device *pdd = target->pdds[i];

View File

@ -31,7 +31,8 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
uint32_t *runtime_info_size);
static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_node *dev)
{
return KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2);
return KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2) ||
KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0);
}
/*

View File

@ -226,6 +226,10 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
queue_input.paging = false;
queue_input.tba_addr = qpd->tba_addr;
queue_input.tma_addr = qpd->tma_addr;
queue_input.trap_en = KFD_GC_VERSION(q->device) < IP_VERSION(11, 0, 0) ||
KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0, 0) ||
q->properties.is_dbg_wa;
queue_input.skip_process_ctx_clear = qpd->pqm->process->debug_trap_enabled;
queue_type = convert_to_mes_queue_type(q->properties.type);
if (queue_type < 0) {
@ -1716,6 +1720,9 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
* updates the is_evicted flag but is a no-op otherwise.
*/
q->properties.is_evicted = !!qpd->evicted;
q->properties.is_dbg_wa = qpd->pqm->process->debug_trap_enabled &&
KFD_GC_VERSION(q->device) >= IP_VERSION(11, 0, 0) &&
KFD_GC_VERSION(q->device) < IP_VERSION(12, 0, 0);
if (qd)
mqd_mgr->restore_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj, &q->gart_mqd_addr,

View File

@ -48,8 +48,7 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
struct cik_mqd *m;
uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */
if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) ||
!minfo->cu_mask.ptr)
if (!minfo || !minfo->cu_mask.ptr)
return;
mqd_symmetrically_map_cu_mask(mm,

View File

@ -48,8 +48,7 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
struct v10_compute_mqd *m;
uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */
if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) ||
!minfo->cu_mask.ptr)
if (!minfo || !minfo->cu_mask.ptr)
return;
mqd_symmetrically_map_cu_mask(mm,

View File

@ -46,15 +46,33 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
{
struct v11_compute_mqd *m;
uint32_t se_mask[KFD_MAX_NUM_SE] = {0};
bool has_wa_flag = minfo && (minfo->update_flag & (UPDATE_FLAG_DBG_WA_ENABLE |
UPDATE_FLAG_DBG_WA_DISABLE));
if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) ||
!minfo->cu_mask.ptr)
if (!minfo || !(has_wa_flag || minfo->cu_mask.ptr))
return;
m = get_mqd(mqd);
if (has_wa_flag) {
uint32_t wa_mask = minfo->update_flag == UPDATE_FLAG_DBG_WA_ENABLE ?
0xffff : 0xffffffff;
m->compute_static_thread_mgmt_se0 = wa_mask;
m->compute_static_thread_mgmt_se1 = wa_mask;
m->compute_static_thread_mgmt_se2 = wa_mask;
m->compute_static_thread_mgmt_se3 = wa_mask;
m->compute_static_thread_mgmt_se4 = wa_mask;
m->compute_static_thread_mgmt_se5 = wa_mask;
m->compute_static_thread_mgmt_se6 = wa_mask;
m->compute_static_thread_mgmt_se7 = wa_mask;
return;
}
mqd_symmetrically_map_cu_mask(mm,
minfo->cu_mask.ptr, minfo->cu_mask.count, se_mask);
m = get_mqd(mqd);
m->compute_static_thread_mgmt_se0 = se_mask[0];
m->compute_static_thread_mgmt_se1 = se_mask[1];
m->compute_static_thread_mgmt_se2 = se_mask[2];
@ -109,6 +127,7 @@ static void init_mqd(struct mqd_manager *mm, void **mqd,
uint64_t addr;
struct v11_compute_mqd *m;
int size;
uint32_t wa_mask = q->is_dbg_wa ? 0xffff : 0xffffffff;
m = (struct v11_compute_mqd *) mqd_mem_obj->cpu_ptr;
addr = mqd_mem_obj->gpu_addr;
@ -122,14 +141,15 @@ static void init_mqd(struct mqd_manager *mm, void **mqd,
m->header = 0xC0310800;
m->compute_pipelinestat_enable = 1;
m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF;
m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF;
m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF;
m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF;
m->compute_static_thread_mgmt_se4 = 0xFFFFFFFF;
m->compute_static_thread_mgmt_se5 = 0xFFFFFFFF;
m->compute_static_thread_mgmt_se6 = 0xFFFFFFFF;
m->compute_static_thread_mgmt_se7 = 0xFFFFFFFF;
m->compute_static_thread_mgmt_se0 = wa_mask;
m->compute_static_thread_mgmt_se1 = wa_mask;
m->compute_static_thread_mgmt_se2 = wa_mask;
m->compute_static_thread_mgmt_se3 = wa_mask;
m->compute_static_thread_mgmt_se4 = wa_mask;
m->compute_static_thread_mgmt_se5 = wa_mask;
m->compute_static_thread_mgmt_se6 = wa_mask;
m->compute_static_thread_mgmt_se7 = wa_mask;
m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK |
0x55 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT;

View File

@ -65,8 +65,7 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
struct v9_mqd *m;
uint32_t se_mask[KFD_MAX_NUM_SE] = {0};
if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) ||
!minfo->cu_mask.ptr)
if (!minfo || !minfo->cu_mask.ptr)
return;
mqd_symmetrically_map_cu_mask(mm,

View File

@ -51,8 +51,7 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
struct vi_mqd *m;
uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */
if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) ||
!minfo->cu_mask.ptr)
if (!minfo || !minfo->cu_mask.ptr)
return;
mqd_symmetrically_map_cu_mask(mm,

View File

@ -513,6 +513,8 @@ struct queue_properties {
bool is_active;
bool is_gws;
uint32_t pm4_target_xcc;
bool is_dbg_wa;
bool is_user_cu_masked;
/* Not relevant for user mode queues in cp scheduling */
unsigned int vmid;
/* Relevant only for sdma queues*/
@ -535,7 +537,8 @@ struct queue_properties {
!(q).is_evicted)
enum mqd_update_flag {
UPDATE_FLAG_CU_MASK = 0,
UPDATE_FLAG_DBG_WA_ENABLE = 1,
UPDATE_FLAG_DBG_WA_DISABLE = 2,
};
struct mqd_update_info {

View File

@ -506,8 +506,12 @@ int pqm_update_mqd(struct process_queue_manager *pqm,
return -EFAULT;
}
/* CUs are masked for debugger requirements so deny user mask */
if (pqn->q->properties.is_dbg_wa && minfo && minfo->cu_mask.ptr)
return -EBUSY;
/* ASICs that have WGPs must enforce pairwise enabled mask checks. */
if (minfo && minfo->update_flag == UPDATE_FLAG_CU_MASK && minfo->cu_mask.ptr &&
if (minfo && minfo->cu_mask.ptr &&
KFD_GC_VERSION(pqn->q->device) >= IP_VERSION(10, 0, 0)) {
int i;
@ -526,6 +530,9 @@ int pqm_update_mqd(struct process_queue_manager *pqm,
if (retval != 0)
return retval;
if (minfo && minfo->cu_mask.ptr)
pqn->q->properties.is_user_cu_masked = true;
return 0;
}

View File

@ -1863,10 +1863,13 @@ static void kfd_topology_set_dbg_firmware_support(struct kfd_topology_device *de
{
bool firmware_supported = true;
/*
* FIXME: GFX11 FW currently not sufficient to deal with CWSR WA.
* Updated FW with API changes coming soon.
*/
if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(11, 0, 0) &&
KFD_GC_VERSION(dev->gpu) < IP_VERSION(12, 0, 0)) {
firmware_supported =
(dev->gpu->adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 9;
firmware_supported = false;
goto out;
}