drm/amdkfd: reset queue which consumes RAS poison (v2)
CP supports unmap queue with reset mode which only destroys specific queue without affecting others. Replacing whole gpu reset with reset queue mode for RAS poison consumption saves much time, and we can also fallback to gpu reset solution if reset queue fails. v2: Return directly if process is NULL; Reset queue solution is not applicable to SDMA, fallback to legacy way; Call kfd_unref_process after lookup process. Signed-off-by: Tao Zhou <tao.zhou1@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Acked-by: Felix Kuehling <Felix.Kuehling@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
dec6344338
commit
b6485bed40
|
@ -721,13 +721,13 @@ bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev)
|
|||
return adev->have_atomics_support;
|
||||
}
|
||||
|
||||
void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev)
|
||||
void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bool reset)
|
||||
{
|
||||
struct ras_err_data err_data = {0, 0, 0, NULL};
|
||||
|
||||
/* CPU MCA will handle page retirement if connected_to_cpu is 1 */
|
||||
if (!adev->gmc.xgmi.connected_to_cpu)
|
||||
amdgpu_umc_process_ras_data_cb(adev, &err_data, NULL);
|
||||
else
|
||||
amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
|
||||
else if (reset)
|
||||
amdgpu_amdkfd_gpu_reset(adev);
|
||||
}
|
||||
|
|
|
@ -296,7 +296,8 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct amdgpu_device *adev,
|
|||
uint64_t *mmap_offset);
|
||||
int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
|
||||
struct tile_config *config);
|
||||
void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev);
|
||||
void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
|
||||
bool reset);
|
||||
#if IS_ENABLED(CONFIG_HSA_AMD)
|
||||
void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
|
||||
void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,
|
||||
|
|
|
@ -89,6 +89,44 @@ enum SQ_INTERRUPT_ERROR_TYPE {
|
|||
#define KFD_SQ_INT_DATA__ERR_TYPE_MASK 0xF00000
|
||||
#define KFD_SQ_INT_DATA__ERR_TYPE__SHIFT 20
|
||||
|
||||
static void event_interrupt_poison_consumption(struct kfd_dev *dev,
|
||||
uint16_t pasid, uint16_t source_id)
|
||||
{
|
||||
int ret = -EINVAL;
|
||||
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
|
||||
|
||||
if (!p)
|
||||
return;
|
||||
|
||||
/* all queues of a process will be unmapped in one time */
|
||||
if (atomic_read(&p->poison)) {
|
||||
kfd_unref_process(p);
|
||||
return;
|
||||
}
|
||||
|
||||
atomic_set(&p->poison, 1);
|
||||
kfd_unref_process(p);
|
||||
|
||||
switch (source_id) {
|
||||
case SOC15_INTSRC_SQ_INTERRUPT_MSG:
|
||||
if (dev->dqm->ops.reset_queues)
|
||||
ret = dev->dqm->ops.reset_queues(dev->dqm, pasid);
|
||||
break;
|
||||
case SOC15_INTSRC_SDMA_ECC:
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
kfd_signal_poison_consumed_event(dev, pasid);
|
||||
|
||||
/* resetting queue passes, do page retirement without gpu reset
|
||||
resetting queue fails, fallback to gpu reset solution */
|
||||
if (!ret)
|
||||
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
|
||||
else
|
||||
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
|
||||
}
|
||||
|
||||
static bool event_interrupt_isr_v9(struct kfd_dev *dev,
|
||||
const uint32_t *ih_ring_entry,
|
||||
uint32_t *patched_ihre,
|
||||
|
@ -230,8 +268,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
|
|||
sq_intr_err);
|
||||
if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
|
||||
sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
|
||||
kfd_signal_poison_consumed_event(dev, pasid);
|
||||
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev);
|
||||
event_interrupt_poison_consumption(dev, pasid, source_id);
|
||||
return;
|
||||
}
|
||||
break;
|
||||
|
@ -252,8 +289,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
|
|||
if (source_id == SOC15_INTSRC_SDMA_TRAP) {
|
||||
kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28);
|
||||
} else if (source_id == SOC15_INTSRC_SDMA_ECC) {
|
||||
kfd_signal_poison_consumed_event(dev, pasid);
|
||||
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev);
|
||||
event_interrupt_poison_consumption(dev, pasid, source_id);
|
||||
return;
|
||||
}
|
||||
} else if (client_id == SOC15_IH_CLIENTID_VMC ||
|
||||
|
|
|
@ -856,6 +856,8 @@ struct kfd_process {
|
|||
struct svm_range_list svms;
|
||||
|
||||
bool xnack_enabled;
|
||||
|
||||
atomic_t poison;
|
||||
};
|
||||
|
||||
#define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */
|
||||
|
|
Loading…
Reference in New Issue