drm/amdgpu: Drop concurrent GPU reset protection for device
Since all GPU resets are now serialized, there is no need for this. This patch also reverts 'drm/amdgpu: race issue when jobs on 2 ring timeout'. Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Reviewed-by: Christian König <christian.koenig@amd.com> Link: https://www.spinics.net/lists/amd-gfx/msg74119.html
This commit is contained in:
parent
681260df4d
commit
f287a3c5b0
|
@ -4817,11 +4817,10 @@ end:
|
|||
return r;
|
||||
}
|
||||
|
||||
static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
|
||||
static void amdgpu_device_lock_adev(struct amdgpu_device *adev,
|
||||
struct amdgpu_hive_info *hive)
|
||||
{
|
||||
if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
|
||||
return false;
|
||||
atomic_set(&adev->in_gpu_reset, 1);
|
||||
|
||||
if (hive) {
|
||||
down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
|
||||
|
@ -4840,8 +4839,6 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
|
|||
adev->mp1_state = PP_MP1_STATE_NONE;
|
||||
break;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
|
||||
|
@ -4852,46 +4849,6 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
|
|||
up_write(&adev->reset_sem);
|
||||
}
|
||||
|
||||
/*
|
||||
* to lockup a list of amdgpu devices in a hive safely, if not a hive
|
||||
* with multiple nodes, it will be similar as amdgpu_device_lock_adev.
|
||||
*
|
||||
* unlock won't require roll back.
|
||||
*/
|
||||
static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
|
||||
{
|
||||
struct amdgpu_device *tmp_adev = NULL;
|
||||
|
||||
if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
|
||||
if (!hive) {
|
||||
dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
|
||||
return -ENODEV;
|
||||
}
|
||||
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
|
||||
if (!amdgpu_device_lock_adev(tmp_adev, hive))
|
||||
goto roll_back;
|
||||
}
|
||||
} else if (!amdgpu_device_lock_adev(adev, hive))
|
||||
return -EAGAIN;
|
||||
|
||||
return 0;
|
||||
roll_back:
|
||||
if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
|
||||
/*
|
||||
* if the lockup iteration break in the middle of a hive,
|
||||
* it may means there may has a race issue,
|
||||
* or a hive device locked up independently.
|
||||
* we may be in trouble and may not, so will try to roll back
|
||||
* the lock and give out a warnning.
|
||||
*/
|
||||
dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
|
||||
list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
|
||||
amdgpu_device_unlock_adev(tmp_adev);
|
||||
}
|
||||
}
|
||||
return -EAGAIN;
|
||||
}
|
||||
|
||||
static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
|
||||
{
|
||||
struct pci_dev *p = NULL;
|
||||
|
@ -5078,22 +5035,6 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
|
|||
reset_context.hive = hive;
|
||||
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
|
||||
|
||||
/*
|
||||
* lock the device before we try to operate the linked list
|
||||
* if didn't get the device lock, don't touch the linked list since
|
||||
* others may iterating it.
|
||||
*/
|
||||
r = amdgpu_device_lock_hive_adev(adev, hive);
|
||||
if (r) {
|
||||
dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
|
||||
job ? job->base.id : -1);
|
||||
|
||||
/* even we skipped this reset, still need to set the job to guilty */
|
||||
if (job && job->vm)
|
||||
drm_sched_increase_karma(&job->base);
|
||||
goto skip_recovery;
|
||||
}
|
||||
|
||||
/*
|
||||
* Build list of devices to reset.
|
||||
* In case we are in XGMI hive mode, resort the device list
|
||||
|
@ -5113,6 +5054,9 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
|
|||
|
||||
/* block all schedulers and reset given job's ring */
|
||||
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
|
||||
|
||||
amdgpu_device_lock_adev(tmp_adev, hive);
|
||||
|
||||
/*
|
||||
* Try to put the audio codec into suspend state
|
||||
* before gpu reset started.
|
||||
|
@ -5264,13 +5208,12 @@ skip_sched_resume:
|
|||
amdgpu_device_unlock_adev(tmp_adev);
|
||||
}
|
||||
|
||||
skip_recovery:
|
||||
if (hive) {
|
||||
mutex_unlock(&hive->hive_lock);
|
||||
amdgpu_put_xgmi_hive(hive);
|
||||
}
|
||||
|
||||
if (r && r != -EAGAIN)
|
||||
if (r)
|
||||
dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
|
||||
return r;
|
||||
}
|
||||
|
@ -5493,20 +5436,6 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
|
||||
struct amdgpu_ring *ring = adev->rings[i];
|
||||
|
||||
if (!ring || !ring->sched.thread)
|
||||
continue;
|
||||
|
||||
cancel_delayed_work_sync(&ring->sched.work_tdr);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_pci_error_detected - Called when a PCI error is detected.
|
||||
* @pdev: PCI device struct
|
||||
|
@ -5537,14 +5466,10 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
|
|||
/* Fatal error, prepare for slot reset */
|
||||
case pci_channel_io_frozen:
|
||||
/*
|
||||
* Cancel and wait for all TDRs in progress if failing to
|
||||
* set adev->in_gpu_reset in amdgpu_device_lock_adev
|
||||
*
|
||||
* Locking adev->reset_sem will prevent any external access
|
||||
* to GPU during PCI error recovery
|
||||
*/
|
||||
while (!amdgpu_device_lock_adev(adev, NULL))
|
||||
amdgpu_cancel_all_tdr(adev);
|
||||
amdgpu_device_lock_adev(adev, NULL);
|
||||
|
||||
/*
|
||||
* Block any work scheduling as we do for regular GPU reset
|
||||
|
|
Loading…
Reference in New Issue