drm/amdgpu: support reset flag set for gpu reset
Move reset_context out of gpu recover function to make it configurable for different reset purpose. For the reset way of call gpu_recovery sysfs, force to use full reset method. Otherwise, try soft reset by default if the related ASIC supportted, if soft reset failed, will use full reset. Signed-off-by: Likun Gao <Likun.Gao@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
58e969b60d
commit
f1549c09c5
|
@ -1253,9 +1253,8 @@ int emu_soc_asic_init(struct amdgpu_device *adev);
|
|||
bool amdgpu_device_has_job_running(struct amdgpu_device *adev);
|
||||
bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev);
|
||||
int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
||||
struct amdgpu_job* job);
|
||||
int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
||||
struct amdgpu_job *job);
|
||||
struct amdgpu_job *job,
|
||||
struct amdgpu_reset_context *reset_context);
|
||||
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev);
|
||||
int amdgpu_device_pci_reset(struct amdgpu_device *adev);
|
||||
bool amdgpu_device_need_post(struct amdgpu_device *adev);
|
||||
|
|
|
@ -129,7 +129,14 @@ static void amdgpu_amdkfd_reset_work(struct work_struct *work)
|
|||
struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
|
||||
kfd.reset_work);
|
||||
|
||||
amdgpu_device_gpu_recover(adev, NULL);
|
||||
struct amdgpu_reset_context reset_context;
|
||||
memset(&reset_context, 0, sizeof(reset_context));
|
||||
|
||||
reset_context.method = AMD_RESET_METHOD_NONE;
|
||||
reset_context.reset_req_dev = adev;
|
||||
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
|
||||
|
||||
amdgpu_device_gpu_recover(adev, NULL, &reset_context);
|
||||
}
|
||||
|
||||
void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
|
||||
|
|
|
@ -5109,7 +5109,8 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
|
|||
*/
|
||||
|
||||
int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
||||
struct amdgpu_job *job)
|
||||
struct amdgpu_job *job,
|
||||
struct amdgpu_reset_context *reset_context)
|
||||
{
|
||||
struct list_head device_list, *device_list_handle = NULL;
|
||||
bool job_signaled = false;
|
||||
|
@ -5119,9 +5120,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
|||
bool need_emergency_restart = false;
|
||||
bool audio_suspended = false;
|
||||
int tmp_vram_lost_counter;
|
||||
struct amdgpu_reset_context reset_context;
|
||||
|
||||
memset(&reset_context, 0, sizeof(reset_context));
|
||||
|
||||
/*
|
||||
* Special case: RAS triggered and full reset isn't supported
|
||||
|
@ -5147,12 +5145,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
|||
if (hive)
|
||||
mutex_lock(&hive->hive_lock);
|
||||
|
||||
reset_context.method = AMD_RESET_METHOD_NONE;
|
||||
reset_context.reset_req_dev = adev;
|
||||
reset_context.job = job;
|
||||
reset_context.hive = hive;
|
||||
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
|
||||
|
||||
reset_context->job = job;
|
||||
reset_context->hive = hive;
|
||||
/*
|
||||
* Build list of devices to reset.
|
||||
* In case we are in XGMI hive mode, resort the device list
|
||||
|
@ -5245,7 +5239,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
|||
|
||||
retry: /* Rest of adevs pre asic reset from XGMI hive. */
|
||||
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
|
||||
r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context);
|
||||
r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
|
||||
/*TODO Should we stop ?*/
|
||||
if (r) {
|
||||
dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
|
||||
|
@ -5272,7 +5266,7 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */
|
|||
if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
|
||||
amdgpu_ras_resume(adev);
|
||||
} else {
|
||||
r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
|
||||
r = amdgpu_do_asic_reset(device_list_handle, reset_context);
|
||||
if (r && r == -EAGAIN)
|
||||
goto retry;
|
||||
}
|
||||
|
@ -5292,7 +5286,7 @@ skip_hw_reset:
|
|||
if (amdgpu_gpu_recovery == 2 &&
|
||||
!(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
|
||||
amdgpu_device_recheck_guilty_jobs(
|
||||
tmp_adev, device_list_handle, &reset_context);
|
||||
tmp_adev, device_list_handle, reset_context);
|
||||
|
||||
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
|
||||
struct amdgpu_ring *ring = tmp_adev->rings[i];
|
||||
|
|
|
@ -844,7 +844,14 @@ static void amdgpu_debugfs_reset_work(struct work_struct *work)
|
|||
struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
|
||||
reset_work);
|
||||
|
||||
amdgpu_device_gpu_recover(adev, NULL);
|
||||
struct amdgpu_reset_context reset_context;
|
||||
memset(&reset_context, 0, sizeof(reset_context));
|
||||
|
||||
reset_context.method = AMD_RESET_METHOD_NONE;
|
||||
reset_context.reset_req_dev = adev;
|
||||
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
|
||||
|
||||
amdgpu_device_gpu_recover(adev, NULL, &reset_context);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
@ -29,6 +29,7 @@
|
|||
|
||||
#include "amdgpu.h"
|
||||
#include "amdgpu_trace.h"
|
||||
#include "amdgpu_reset.h"
|
||||
|
||||
static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
|
||||
{
|
||||
|
@ -64,7 +65,14 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
|
|||
ti.process_name, ti.tgid, ti.task_name, ti.pid);
|
||||
|
||||
if (amdgpu_device_should_recover_gpu(ring->adev)) {
|
||||
r = amdgpu_device_gpu_recover(ring->adev, job);
|
||||
struct amdgpu_reset_context reset_context;
|
||||
memset(&reset_context, 0, sizeof(reset_context));
|
||||
|
||||
reset_context.method = AMD_RESET_METHOD_NONE;
|
||||
reset_context.reset_req_dev = adev;
|
||||
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
|
||||
|
||||
r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
|
||||
if (r)
|
||||
DRM_ERROR("GPU Recovery Failed: %d\n", r);
|
||||
} else {
|
||||
|
|
|
@ -1942,8 +1942,16 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
|
|||
amdgpu_put_xgmi_hive(hive);
|
||||
}
|
||||
|
||||
if (amdgpu_device_should_recover_gpu(ras->adev))
|
||||
amdgpu_device_gpu_recover(ras->adev, NULL);
|
||||
if (amdgpu_device_should_recover_gpu(ras->adev)) {
|
||||
struct amdgpu_reset_context reset_context;
|
||||
memset(&reset_context, 0, sizeof(reset_context));
|
||||
|
||||
reset_context.method = AMD_RESET_METHOD_NONE;
|
||||
reset_context.reset_req_dev = adev;
|
||||
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
|
||||
|
||||
amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
|
||||
}
|
||||
atomic_set(&ras->in_recovery, 0);
|
||||
}
|
||||
|
||||
|
|
|
@ -283,8 +283,16 @@ flr_done:
|
|||
/* Trigger recovery for world switch failure if no TDR */
|
||||
if (amdgpu_device_should_recover_gpu(adev)
|
||||
&& (!amdgpu_device_has_job_running(adev) ||
|
||||
adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT))
|
||||
amdgpu_device_gpu_recover(adev, NULL);
|
||||
adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT)) {
|
||||
struct amdgpu_reset_context reset_context;
|
||||
memset(&reset_context, 0, sizeof(reset_context));
|
||||
|
||||
reset_context.method = AMD_RESET_METHOD_NONE;
|
||||
reset_context.reset_req_dev = adev;
|
||||
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
|
||||
|
||||
amdgpu_device_gpu_recover(adev, NULL, &reset_context);
|
||||
}
|
||||
}
|
||||
|
||||
static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,
|
||||
|
|
|
@ -310,8 +310,16 @@ flr_done:
|
|||
adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT ||
|
||||
adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
|
||||
adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
|
||||
adev->video_timeout == MAX_SCHEDULE_TIMEOUT))
|
||||
amdgpu_device_gpu_recover(adev, NULL);
|
||||
adev->video_timeout == MAX_SCHEDULE_TIMEOUT)) {
|
||||
struct amdgpu_reset_context reset_context;
|
||||
memset(&reset_context, 0, sizeof(reset_context));
|
||||
|
||||
reset_context.method = AMD_RESET_METHOD_NONE;
|
||||
reset_context.reset_req_dev = adev;
|
||||
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
|
||||
|
||||
amdgpu_device_gpu_recover(adev, NULL, &reset_context);
|
||||
}
|
||||
}
|
||||
|
||||
static int xgpu_nv_set_mailbox_rcv_irq(struct amdgpu_device *adev,
|
||||
|
|
|
@ -522,8 +522,16 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
|
|||
}
|
||||
|
||||
/* Trigger recovery due to world switch failure */
|
||||
if (amdgpu_device_should_recover_gpu(adev))
|
||||
amdgpu_device_gpu_recover(adev, NULL);
|
||||
if (amdgpu_device_should_recover_gpu(adev)) {
|
||||
struct amdgpu_reset_context reset_context;
|
||||
memset(&reset_context, 0, sizeof(reset_context));
|
||||
|
||||
reset_context.method = AMD_RESET_METHOD_NONE;
|
||||
reset_context.reset_req_dev = adev;
|
||||
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
|
||||
|
||||
amdgpu_device_gpu_recover(adev, NULL, &reset_context);
|
||||
}
|
||||
}
|
||||
|
||||
static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev,
|
||||
|
|
Loading…
Reference in New Issue