drm/amdkfd: reset queue which consumes RAS poison (v2)

CP supports unmap queue with reset mode which only destroys specific queue without affecting others.
Replacing whole gpu reset with reset queue mode for RAS poison consumption
saves much time, and we can also fallback to gpu reset solution if reset
queue fails.

v2: Return directly if process is NULL;
    Reset queue solution is not applicable to SDMA, fallback to legacy
way;
    Call kfd_unref_process after lookup process.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Acked-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Tao Zhou 2021-12-06 15:54:54 +08:00 committed by Alex Deucher
parent dec6344338
commit b6485bed40
4 changed files with 47 additions and 8 deletions

View File

@ -721,13 +721,13 @@ bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev)
return adev->have_atomics_support;
}
void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev)
void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bool reset)
{
struct ras_err_data err_data = {0, 0, 0, NULL};
/* CPU MCA will handle page retirement if connected_to_cpu is 1 */
if (!adev->gmc.xgmi.connected_to_cpu)
amdgpu_umc_process_ras_data_cb(adev, &err_data, NULL);
else
amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
else if (reset)
amdgpu_amdkfd_gpu_reset(adev);
}

View File

@ -296,7 +296,8 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct amdgpu_device *adev,
uint64_t *mmap_offset);
int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
struct tile_config *config);
void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev);
void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
bool reset);
#if IS_ENABLED(CONFIG_HSA_AMD)
void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,

View File

@ -89,6 +89,44 @@ enum SQ_INTERRUPT_ERROR_TYPE {
#define KFD_SQ_INT_DATA__ERR_TYPE_MASK 0xF00000
#define KFD_SQ_INT_DATA__ERR_TYPE__SHIFT 20
static void event_interrupt_poison_consumption(struct kfd_dev *dev,
uint16_t pasid, uint16_t source_id)
{
int ret = -EINVAL;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
if (!p)
return;
/* all queues of a process will be unmapped in one time */
if (atomic_read(&p->poison)) {
kfd_unref_process(p);
return;
}
atomic_set(&p->poison, 1);
kfd_unref_process(p);
switch (source_id) {
case SOC15_INTSRC_SQ_INTERRUPT_MSG:
if (dev->dqm->ops.reset_queues)
ret = dev->dqm->ops.reset_queues(dev->dqm, pasid);
break;
case SOC15_INTSRC_SDMA_ECC:
default:
break;
}
kfd_signal_poison_consumed_event(dev, pasid);
/* resetting queue passes, do page retirement without gpu reset
resetting queue fails, fallback to gpu reset solution */
if (!ret)
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
else
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
}
static bool event_interrupt_isr_v9(struct kfd_dev *dev,
const uint32_t *ih_ring_entry,
uint32_t *patched_ihre,
@ -230,8 +268,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
sq_intr_err);
if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
kfd_signal_poison_consumed_event(dev, pasid);
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev);
event_interrupt_poison_consumption(dev, pasid, source_id);
return;
}
break;
@ -252,8 +289,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
if (source_id == SOC15_INTSRC_SDMA_TRAP) {
kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28);
} else if (source_id == SOC15_INTSRC_SDMA_ECC) {
kfd_signal_poison_consumed_event(dev, pasid);
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev);
event_interrupt_poison_consumption(dev, pasid, source_id);
return;
}
} else if (client_id == SOC15_IH_CLIENTID_VMC ||

View File

@ -856,6 +856,8 @@ struct kfd_process {
struct svm_range_list svms;
bool xnack_enabled;
atomic_t poison;
};
#define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */