drm/amdkfd: Improve HWS hang detection and handling
Move HWS hang detection into unmap_queues_cpsch to catch hangs in all
cases. If this happens during a reset, don't schedule another reset,
because the reset already in progress is expected to take care of it.

Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Tested-by: Emily Deng <Emily.Deng@amd.com>
Reviewed-by: shaoyunl <shaoyun.liu@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
commit 09c34e8d7a (parent 63e088acfc)
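The interplay of the two flags is easiest to see end to end: pre_reset() marks the device as mid-reset before KFD is suspended, unmap_queues_cpsch() records a hang and only schedules the reset work when no reset is already underway, and start_cpsch() clears both flags when the scheduler comes back up. The following is a minimal userspace sketch of that protocol; pre_reset and start_cpsch mirror the kernel function names, while dqm_sim, on_preemption_timeout, and the printf stand-ins for the work queue are illustrative inventions, not driver code.

/* Minimal userspace sketch of the is_resetting/is_hws_hang protocol
 * introduced by this patch. Locking, the fence wait, and the real
 * work queue are stubbed out; only the flag logic is modeled.
 */
#include <stdbool.h>
#include <stdio.h>

struct dqm_sim {                /* illustrative stand-in for device_queue_manager */
	bool is_hws_hang;       /* set when queue preemption times out */
	bool is_resetting;      /* set by pre_reset() while a GPU reset runs */
};

/* Called from the pre-reset path before KFD is suspended: mark the
 * device as mid-reset so hang detection won't schedule a second reset. */
static void pre_reset(struct dqm_sim *dqm)
{
	dqm->is_resetting = true;
}

/* Models start_cpsch(): both flags are cleared when the HW scheduler
 * is (re)started. */
static void start_cpsch(struct dqm_sim *dqm)
{
	dqm->is_hws_hang = false;
	dqm->is_resetting = false;
}

/* Models the failure path of unmap_queues_cpsch(): record the hang and
 * schedule reset work only if no reset is already in progress. */
static void on_preemption_timeout(struct dqm_sim *dqm)
{
	dqm->is_hws_hang = true;
	if (!dqm->is_resetting)
		printf("scheduling hw_exception_work (GPU reset)\n");
	else
		printf("hang detected mid-reset: in-progress reset handles it\n");
}

int main(void)
{
	struct dqm_sim dqm = { false, false };

	on_preemption_timeout(&dqm); /* normal hang -> schedule a reset */
	pre_reset(&dqm);             /* a GPU reset begins */
	on_preemption_timeout(&dqm); /* hang mid-reset -> no second reset */
	start_cpsch(&dqm);           /* scheduler restarted, flags cleared */
	return 0;
}

Run as written, the sketch schedules one reset for the first hang and none for the hang that arrives mid-reset, which is exactly the double-reset this patch avoids.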
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -728,6 +728,9 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
 {
 	if (!kfd->init_complete)
 		return 0;
+
+	kfd->dqm->ops.pre_reset(kfd->dqm);
+
 	kgd2kfd_suspend(kfd);
 
 	kfd_signal_reset_event(kfd);
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -952,6 +952,13 @@ static int stop_nocpsch(struct device_queue_manager *dqm)
 	return 0;
 }
 
+static void pre_reset(struct device_queue_manager *dqm)
+{
+	dqm_lock(dqm);
+	dqm->is_resetting = true;
+	dqm_unlock(dqm);
+}
+
 static int allocate_sdma_queue(struct device_queue_manager *dqm,
 				struct queue *q)
 {
@@ -1099,6 +1106,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
 	dqm_lock(dqm);
 	/* clear hang status when driver try to start the hw scheduler */
 	dqm->is_hws_hang = false;
+	dqm->is_resetting = false;
 	dqm->sched_running = true;
 	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
 	dqm_unlock(dqm);
@@ -1351,8 +1359,17 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 	/* should be timed out */
 	retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED,
 				queue_preemption_timeout_ms);
-	if (retval)
+	if (retval) {
+		pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
+		dqm->is_hws_hang = true;
+		/* It's possible we're detecting a HWS hang in the
+		 * middle of a GPU reset. No need to schedule another
+		 * reset in this case.
+		 */
+		if (!dqm->is_resetting)
+			schedule_work(&dqm->hw_exception_work);
 		return retval;
+	}
 
 	pm_release_ib(&dqm->packets);
 	dqm->active_runlist = false;
@@ -1370,12 +1387,8 @@ static int execute_queues_cpsch(struct device_queue_manager *dqm,
 	if (dqm->is_hws_hang)
 		return -EIO;
 	retval = unmap_queues_cpsch(dqm, filter, filter_param);
-	if (retval) {
-		pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
-		dqm->is_hws_hang = true;
-		schedule_work(&dqm->hw_exception_work);
+	if (retval)
 		return retval;
-	}
 
 	return map_queues_cpsch(dqm);
 }
@@ -1769,6 +1782,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
 		dqm->ops.initialize = initialize_cpsch;
 		dqm->ops.start = start_cpsch;
 		dqm->ops.stop = stop_cpsch;
+		dqm->ops.pre_reset = pre_reset;
 		dqm->ops.destroy_queue = destroy_queue_cpsch;
 		dqm->ops.update_queue = update_queue;
 		dqm->ops.register_process = register_process;
@@ -1787,6 +1801,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
 		/* initialize dqm for no cp scheduling */
 		dqm->ops.start = start_nocpsch;
 		dqm->ops.stop = stop_nocpsch;
+		dqm->ops.pre_reset = pre_reset;
 		dqm->ops.create_queue = create_queue_nocpsch;
 		dqm->ops.destroy_queue = destroy_queue_nocpsch;
 		dqm->ops.update_queue = update_queue;
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -104,6 +104,7 @@ struct device_queue_manager_ops {
 	int	(*initialize)(struct device_queue_manager *dqm);
 	int	(*start)(struct device_queue_manager *dqm);
 	int	(*stop)(struct device_queue_manager *dqm);
+	void	(*pre_reset)(struct device_queue_manager *dqm);
 	void	(*uninitialize)(struct device_queue_manager *dqm);
 	int	(*create_kernel_queue)(struct device_queue_manager *dqm,
 					struct kernel_queue *kq,
@@ -198,6 +199,7 @@ struct device_queue_manager {
 
 	/* hw exception */
 	bool			is_hws_hang;
+	bool			is_resetting;
 	struct work_struct	hw_exception_work;
 	struct kfd_mem_obj	hiq_sdma_mqd;
 	bool			sched_running;