drm/amdgpu: Implement concurrent asic reset for XGMI.
Use a per-hive work queue to send reset commands concurrently to all nodes in the hive. v2: Switch to system_highpri_wq after dropping the dedicated queue. Fix a KASAN error in the non-XGMI code path. Stop the per-node hive reset loop if the reset fails on any of the nodes. Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Acked-by: Alex Deucher <alexander.deucher@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
a82400b57a
commit
d4535e2c01
|
@ -910,7 +910,9 @@ struct amdgpu_device {
|
||||||
bool in_gpu_reset;
|
bool in_gpu_reset;
|
||||||
struct mutex lock_reset;
|
struct mutex lock_reset;
|
||||||
struct amdgpu_doorbell_index doorbell_index;
|
struct amdgpu_doorbell_index doorbell_index;
|
||||||
|
|
||||||
int asic_reset_res;
|
int asic_reset_res;
|
||||||
|
struct work_struct xgmi_reset_work;
|
||||||
};
|
};
|
||||||
|
|
||||||
static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
|
static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
|
||||||
|
|
|
@ -2356,6 +2356,19 @@ bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
|
||||||
return amdgpu_device_asic_has_dc_support(adev->asic_type);
|
return amdgpu_device_asic_has_dc_support(adev->asic_type);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * amdgpu_device_xgmi_reset_func - work handler that resets a single device
 * @__work: the xgmi_reset_work item embedded in a struct amdgpu_device
 *
 * Recovers the owning amdgpu_device from the work item, calls
 * amdgpu_asic_reset() on it and stores the return code in
 * adev->asic_reset_res. A non-zero result is additionally reported
 * through DRM_WARN together with the DRM device's unique name.
 */
static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
|
||||||
|
{
|
||||||
|
struct amdgpu_device *adev =
|
||||||
|
container_of(__work, struct amdgpu_device, xgmi_reset_work);
|
||||||
|

||||||
|
/* The handler returns void, so the result is kept on the device itself. */
adev->asic_reset_res = amdgpu_asic_reset(adev);
|
||||||
|
if (adev->asic_reset_res)
|
||||||
|
DRM_WARN("ASIC reset failed with err r, %d for drm dev, %s",
|
||||||
|
adev->asic_reset_res, adev->ddev->unique);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* amdgpu_device_init - initialize the driver
|
* amdgpu_device_init - initialize the driver
|
||||||
*
|
*
|
||||||
|
@ -2454,6 +2467,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
|
||||||
INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
|
INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
|
||||||
amdgpu_device_delay_enable_gfx_off);
|
amdgpu_device_delay_enable_gfx_off);
|
||||||
|
|
||||||
|
INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
|
||||||
|
|
||||||
adev->gfx.gfx_off_req_count = 1;
|
adev->gfx.gfx_off_req_count = 1;
|
||||||
adev->pm.ac_power = power_supply_is_system_supplied() > 0 ? true : false;
|
adev->pm.ac_power = power_supply_is_system_supplied() > 0 ? true : false;
|
||||||
|
|
||||||
|
@ -3331,10 +3346,31 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
|
||||||
*/
|
*/
|
||||||
if (need_full_reset) {
|
if (need_full_reset) {
|
||||||
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
|
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
|
||||||
r = amdgpu_asic_reset(tmp_adev);
|
/* For XGMI run all resets in parallel to speed up the process */
|
||||||
if (r)
|
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
|
||||||
DRM_WARN("ASIC reset failed with err r, %d for drm dev, %s",
|
if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
|
||||||
|
r = -EALREADY;
|
||||||
|
} else
|
||||||
|
r = amdgpu_asic_reset(tmp_adev);
|
||||||
|
|
||||||
|
if (r) {
|
||||||
|
DRM_ERROR("ASIC reset failed with err r, %d for drm dev, %s",
|
||||||
r, tmp_adev->ddev->unique);
|
r, tmp_adev->ddev->unique);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* For XGMI wait for all PSP resets to complete before proceed */
|
||||||
|
if (!r) {
|
||||||
|
list_for_each_entry(tmp_adev, device_list_handle,
|
||||||
|
gmc.xgmi.head) {
|
||||||
|
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
|
||||||
|
flush_work(&tmp_adev->xgmi_reset_work);
|
||||||
|
r = tmp_adev->asic_reset_res;
|
||||||
|
if (r)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3521,8 +3557,6 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */
|
||||||
if (tmp_adev == adev)
|
if (tmp_adev == adev)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
dev_info(tmp_adev->dev, "GPU reset begin for drm dev %s!\n", adev->ddev->unique);
|
|
||||||
|
|
||||||
amdgpu_device_lock_adev(tmp_adev);
|
amdgpu_device_lock_adev(tmp_adev);
|
||||||
r = amdgpu_device_pre_asic_reset(tmp_adev,
|
r = amdgpu_device_pre_asic_reset(tmp_adev,
|
||||||
NULL,
|
NULL,
|
||||||
|
|
Loading…
Reference in New Issue